refactor to use Peewee objects extension in place of redefining every SPIP atribute
This commit is contained in:
parent
4d269357de
commit
13fa720562
@ -3,7 +3,7 @@ from re import I, S, compile, finditer, sub
|
|||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
# SPIP syntax to Markdown
|
# SPIP syntax to Markdown
|
||||||
spip_to_markdown = (
|
SPIP_TO_MARKDOWN = (
|
||||||
( # horizontal rule
|
( # horizontal rule
|
||||||
compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I),
|
compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I),
|
||||||
# r"---",
|
# r"---",
|
||||||
@ -40,6 +40,14 @@ spip_to_markdown = (
|
|||||||
),
|
),
|
||||||
r"~\1~",
|
r"~\1~",
|
||||||
),
|
),
|
||||||
|
( # images
|
||||||
|
compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I),
|
||||||
|
r"![](\1\2)",
|
||||||
|
),
|
||||||
|
( # documents & embeds
|
||||||
|
compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
|
||||||
|
r"[](\1\2)",
|
||||||
|
),
|
||||||
( # anchor
|
( # anchor
|
||||||
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
|
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
|
||||||
r"[\1](\2)",
|
r"[\1](\2)",
|
||||||
@ -100,58 +108,20 @@ spip_to_markdown = (
|
|||||||
),
|
),
|
||||||
r"\1",
|
r"\1",
|
||||||
),
|
),
|
||||||
)
|
( # WARNING remove every html tag
|
||||||
|
compile(r"<\/?.*?>\s*", S | I),
|
||||||
spip_to_text = (
|
|
||||||
( # strong
|
|
||||||
compile(r"\{\{ *(.*?) *\}\}", S | I),
|
|
||||||
r"\1",
|
|
||||||
),
|
|
||||||
( # html strong
|
|
||||||
compile(r"<strong> *(.*?) *</strong>", S | I),
|
|
||||||
r"\1",
|
|
||||||
),
|
|
||||||
( # emphasis
|
|
||||||
compile(r"\{ *(.*?) *\}", S | I),
|
|
||||||
r"\1",
|
|
||||||
),
|
|
||||||
( # html emphasis
|
|
||||||
compile(r"<i> *(.*?) *<\/i>", S | I),
|
|
||||||
r"\1",
|
|
||||||
),
|
|
||||||
( # strikethrough
|
|
||||||
compile(
|
|
||||||
r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
|
|
||||||
S | I,
|
|
||||||
),
|
|
||||||
r"\1",
|
|
||||||
),
|
|
||||||
( # Keep only the first language in multi-language blocks
|
|
||||||
compile(
|
|
||||||
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
|
|
||||||
S | I,
|
|
||||||
),
|
|
||||||
r"\1",
|
|
||||||
),
|
|
||||||
( # remove every html tag
|
|
||||||
compile(r"<\/?.*?> *", S | I),
|
|
||||||
r"",
|
|
||||||
),
|
|
||||||
( # Remove beginning with angle bracket(s)
|
|
||||||
compile(r"^>+ +", S | I),
|
|
||||||
r"",
|
|
||||||
),
|
|
||||||
( # Remove beginning with a number followed by a dot
|
|
||||||
compile(r"^\d+\. +", S | I),
|
|
||||||
r"",
|
r"",
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
# HTML tag WARNING can be used to remove them all
|
# Further cleaning for metadata texts such as titles or descriptions
|
||||||
html_tag = compile(r"<\/?.*?> *", S | I)
|
SPIP_META_BLOAT = (
|
||||||
|
compile(r"^>+ +", S | I), # Remove beginning with angle bracket(s)
|
||||||
|
compile(r"^\d+\. +", S | I), # Remove beginning with a number followed by a dot
|
||||||
|
)
|
||||||
|
|
||||||
# Broken ISO encoding to proper UTF-8
|
# Broken ISO encoding to proper UTF-8
|
||||||
iso_to_utf = (
|
ISO_TO_UTF = (
|
||||||
( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1
|
( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1
|
||||||
"’",
|
"’",
|
||||||
r"’",
|
r"’",
|
||||||
@ -264,82 +234,71 @@ iso_to_utf = (
|
|||||||
)
|
)
|
||||||
|
|
||||||
# WARNING unknown broken encoding
|
# WARNING unknown broken encoding
|
||||||
unknown_iso = (
|
UNKNOWN_ISO = (
|
||||||
r"
",
|
r"
",
|
||||||
r"∆",
|
r"∆",
|
||||||
r"û",
|
r"û",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# Apply spip_to_markdown conversions to a text
|
# Apply SPIP to Markdown & ISO to UTF conversions to a text, & eventually clean meta
|
||||||
def convert_body(text: Optional[str]) -> str:
|
def convert(text: Optional[str], clean_meta: bool = False) -> str:
|
||||||
if text is None:
|
if text is None:
|
||||||
return ""
|
return ""
|
||||||
for spip, markdown in spip_to_markdown:
|
for spip, markdown in SPIP_TO_MARKDOWN:
|
||||||
text = spip.sub(markdown, text)
|
text = spip.sub(markdown, text)
|
||||||
for iso, utf in iso_to_utf:
|
if clean_meta:
|
||||||
|
for bloat in SPIP_META_BLOAT:
|
||||||
|
text = bloat.sub("", text)
|
||||||
|
for iso, utf in ISO_TO_UTF:
|
||||||
text = text.replace(iso, utf)
|
text = text.replace(iso, utf)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
# Apply spip_to_text conversions to a text
|
# Replace images & files links in Markdown with real slugs of the actually linked files
|
||||||
def convert_meta(text: Optional[str]) -> str:
|
def link_documents(text: str, documents: list[tuple[int, str, str]]) -> str:
|
||||||
if text is None:
|
|
||||||
return ""
|
|
||||||
for spip, metadata in spip_to_text:
|
|
||||||
text = spip.sub(metadata, text)
|
|
||||||
for iso, utf in iso_to_utf:
|
|
||||||
text = text.replace(iso, utf)
|
|
||||||
return text
|
|
||||||
|
|
||||||
|
|
||||||
# Replace images & documents in SPIP text with Markdown links with human-readable names
|
|
||||||
def convert_documents(text: str, documents: list[tuple[int, str, str]]) -> str:
|
|
||||||
for id, name, slug in documents:
|
for id, name, slug in documents:
|
||||||
|
# Replace images that dont have a title written in text
|
||||||
text = sub(
|
text = sub(
|
||||||
r"<(?:img|image)" + str(id) + r"(\|.*?)*>",
|
r"\[]\((?:img|image)" + str(id) + r"(\|.*?)*\)",
|
||||||
f"![{name}]({slug})",
|
f"![{name}]({slug})",
|
||||||
text,
|
text,
|
||||||
)
|
)
|
||||||
|
# Replace images that dont have a title written in text
|
||||||
text = sub(
|
text = sub(
|
||||||
r"<(?:doc|emb)" + str(id) + r"(\|.*?)*>",
|
r"\[]\((?:doc|document|emb)" + str(id) + r"(\|.*?)*\)",
|
||||||
f"[{name}]({slug})",
|
f"[{name}]({slug})",
|
||||||
text,
|
text,
|
||||||
)
|
)
|
||||||
|
# Replace images that already had a title in Markdown style link
|
||||||
text = sub(
|
text = sub(
|
||||||
r"\[(.*?)\]\((?:doc|emb)" + str(id) + r"(\|.*?)*\)",
|
r"\[(.+?)\]\((?:img|image)" + str(id) + r"(\|.*?)*\)",
|
||||||
|
f"![\\1]({slug})",
|
||||||
|
text,
|
||||||
|
)
|
||||||
|
# Replace documents that already had a title in Markdown style link
|
||||||
|
text = sub(
|
||||||
|
r"\[(.+?)\]\((?:doc|document|emb)" + str(id) + r"(\|.*?)*\)",
|
||||||
f"[\\1]({slug})",
|
f"[\\1]({slug})",
|
||||||
text,
|
text,
|
||||||
)
|
)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
# Replace unknown chars with empty strings (delete them)
|
|
||||||
def remove_unknown_chars(text: str) -> str:
|
|
||||||
for char in unknown_iso:
|
|
||||||
text.replace(char, "")
|
|
||||||
return text
|
|
||||||
|
|
||||||
|
|
||||||
# Replace HTML tags chars with empty strings (delete them)
|
|
||||||
def remove_tags(text: str) -> str:
|
|
||||||
return html_tag.sub("", text)
|
|
||||||
|
|
||||||
|
|
||||||
# Return a list of tuples giving the start and end of unknown substring in text
|
# Return a list of tuples giving the start and end of unknown substring in text
|
||||||
def unknown_chars(text: str) -> list[tuple[int, int]]:
|
def unknown_chars(text: str) -> list[tuple[int, int]]:
|
||||||
positions: list[tuple[int, int]] = []
|
positions: list[tuple[int, int]] = []
|
||||||
for char in unknown_iso:
|
for char in UNKNOWN_ISO:
|
||||||
for match in finditer("(" + char + ")+", text):
|
for match in finditer("(" + char + ")+", text):
|
||||||
positions.append((match.start(), match.end()))
|
positions.append((match.start(), match.end()))
|
||||||
return positions
|
return positions
|
||||||
|
|
||||||
|
|
||||||
# Return strings with unknown chards found in text, surrounded by context_length chars
|
# Return strings with unknown chards found in text, surrounded by context_length chars
|
||||||
def get_unknown_chars(text: str, context_length: int = 20) -> list[str]:
|
def unknown_chars_context(text: str, context_length: int = 20) -> list[str]:
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
context: str = r".{0," + str(context_length) + r"}"
|
context: str = r".{0," + str(context_length) + r"}"
|
||||||
for char in unknown_iso:
|
for char in UNKNOWN_ISO:
|
||||||
matches = finditer(
|
matches = finditer(
|
||||||
context + r"(?=" + char + r")" + char + r".*?(?=\r?\n|$)",
|
context + r"(?=" + char + r")" + char + r".*?(?=\r?\n|$)",
|
||||||
text,
|
text,
|
@ -1,4 +1,3 @@
|
|||||||
# pyright: basic
|
|
||||||
# type: ignore
|
# type: ignore
|
||||||
from peewee import (
|
from peewee import (
|
||||||
SQL,
|
SQL,
|
||||||
@ -15,7 +14,7 @@ from peewee import (
|
|||||||
TextField,
|
TextField,
|
||||||
)
|
)
|
||||||
|
|
||||||
db = MySQLDatabase(None)
|
DB = MySQLDatabase(None)
|
||||||
|
|
||||||
|
|
||||||
# class UnknownField(object):
|
# class UnknownField(object):
|
||||||
@ -25,7 +24,7 @@ db = MySQLDatabase(None)
|
|||||||
|
|
||||||
class BaseModel(Model):
|
class BaseModel(Model):
|
||||||
class Meta:
|
class Meta:
|
||||||
database: MySQLDatabase = db
|
database: MySQLDatabase = DB
|
||||||
|
|
||||||
|
|
||||||
class SpipArticles(BaseModel):
|
class SpipArticles(BaseModel):
|
||||||
|
259
spip2md/items.py
259
spip2md/items.py
@ -1,259 +0,0 @@
|
|||||||
# pyright: strict
|
|
||||||
from os.path import basename, splitext
|
|
||||||
from typing import Any, Optional
|
|
||||||
|
|
||||||
from slugify import slugify
|
|
||||||
from yaml import dump
|
|
||||||
|
|
||||||
from converter import convert_body, convert_documents, convert_meta, remove_tags
|
|
||||||
from database import (
|
|
||||||
SpipArticles,
|
|
||||||
SpipAuteurs,
|
|
||||||
SpipAuteursLiens,
|
|
||||||
SpipDocuments,
|
|
||||||
SpipDocumentsLiens,
|
|
||||||
SpipRubriques,
|
|
||||||
)
|
|
||||||
|
|
||||||
EXPORTTYPE: str = "md"
|
|
||||||
|
|
||||||
|
|
||||||
class Iterator:
|
|
||||||
items: list[Any]
|
|
||||||
|
|
||||||
def __init__(self) -> None:
|
|
||||||
# Set the limit at the number of retrieved items
|
|
||||||
self.LIMIT: int = len(self.items)
|
|
||||||
# Start before the first element
|
|
||||||
self.count: int = -1
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
return self
|
|
||||||
|
|
||||||
def __len__(self) -> int:
|
|
||||||
return self.LIMIT
|
|
||||||
|
|
||||||
def remaining(self) -> int:
|
|
||||||
return self.LIMIT - self.count
|
|
||||||
|
|
||||||
def __next__(self) -> Any:
|
|
||||||
self.count += 1
|
|
||||||
if self.remaining() <= 0:
|
|
||||||
raise StopIteration
|
|
||||||
return self.items[self.count]
|
|
||||||
|
|
||||||
|
|
||||||
class Document:
|
|
||||||
def __init__(self, document: SpipDocuments) -> None:
|
|
||||||
self.id: int = document.id_document
|
|
||||||
self.thumbnail_id: int = document.id_vignette
|
|
||||||
self.title: str = convert_meta(document.titre)
|
|
||||||
self.date: str = document.date
|
|
||||||
self.description: str = convert_meta(document.descriptif)
|
|
||||||
self.file: str = document.fichier
|
|
||||||
self.draft: bool = document.statut == "publie"
|
|
||||||
self.creation: str = document.date
|
|
||||||
self.publication: str = document.date_publication
|
|
||||||
self.update: str = document.maj
|
|
||||||
self.media: str = document.media
|
|
||||||
|
|
||||||
def get_slug(self, date: bool = False) -> str:
|
|
||||||
name_type = splitext(basename(self.file))
|
|
||||||
return (
|
|
||||||
slugify((self.publication + "-" if date else "") + name_type[0])
|
|
||||||
+ name_type[1]
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class Documents(Iterator):
|
|
||||||
def __init__(self, object_id: int) -> None:
|
|
||||||
# Query the DB to retrieve all documents related to object of id object_id
|
|
||||||
items = (
|
|
||||||
SpipDocuments.select()
|
|
||||||
.join(
|
|
||||||
SpipDocumentsLiens,
|
|
||||||
on=(SpipDocuments.id_document == SpipDocumentsLiens.id_document),
|
|
||||||
)
|
|
||||||
.where(SpipDocumentsLiens.id_objet == object_id)
|
|
||||||
)
|
|
||||||
self.items: list[Document] = [Document(i) for i in items]
|
|
||||||
super().__init__()
|
|
||||||
|
|
||||||
|
|
||||||
class Item:
|
|
||||||
id: int
|
|
||||||
|
|
||||||
def __init__(self, item: SpipArticles | SpipRubriques):
|
|
||||||
self.title: str = convert_meta(item.titre)
|
|
||||||
self.section_id: int = item.id_rubrique
|
|
||||||
self.description: str = convert_meta(item.descriptif)
|
|
||||||
self.text: str = convert_body(item.texte) # Convert SPIP to Markdown
|
|
||||||
self.publication: str = item.date
|
|
||||||
self.draft: bool = item.statut == "publie"
|
|
||||||
self.sector_id: int = item.id_secteur
|
|
||||||
self.update: str = item.maj
|
|
||||||
self.lang: str = item.lang
|
|
||||||
self.set_lang: bool = item.langue_choisie == "oui" # TODO Why ?
|
|
||||||
self.translation_key: int = item.id_trad
|
|
||||||
self.extra: str = convert_body(item.extra) # Probably unused
|
|
||||||
|
|
||||||
def get_slug(self, date: bool = False) -> str:
|
|
||||||
return slugify((self.publication + "-" if date else "") + self.title)
|
|
||||||
|
|
||||||
def get_filename(self) -> str:
|
|
||||||
return "index" + "." + self.lang + "." + EXPORTTYPE
|
|
||||||
|
|
||||||
def get_frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
|
|
||||||
return dump(
|
|
||||||
{
|
|
||||||
"lang": self.lang,
|
|
||||||
"translationKey": self.translation_key,
|
|
||||||
"title": self.title,
|
|
||||||
"publishDate": self.publication,
|
|
||||||
"lastmod": self.update,
|
|
||||||
"draft": self.draft,
|
|
||||||
"description": self.description,
|
|
||||||
# Debugging
|
|
||||||
"spip_id": self.id,
|
|
||||||
"spip_id_secteur": self.sector_id,
|
|
||||||
}
|
|
||||||
| append
|
|
||||||
if append is not None
|
|
||||||
else {},
|
|
||||||
allow_unicode=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
def get_body(self) -> str:
|
|
||||||
body: str = ""
|
|
||||||
# Add the title as a Markdown h1
|
|
||||||
if len(self.title) > 0:
|
|
||||||
body += "\n\n# " + self.title
|
|
||||||
# If there is a text, add the text preceded by two line breaks
|
|
||||||
if len(self.text) > 0:
|
|
||||||
# Convert images & files links
|
|
||||||
text: str = convert_documents(
|
|
||||||
self.text,
|
|
||||||
[(d.id, d.title, d.get_slug()) for d in self.get_documents()],
|
|
||||||
)
|
|
||||||
# Remove remaining HTML after & append to body
|
|
||||||
body += "\n\n" + remove_tags(text)
|
|
||||||
# Same with an "extra" section
|
|
||||||
if len(self.extra) > 0:
|
|
||||||
body += "\n\n# EXTRA\n\n" + self.extra
|
|
||||||
return body
|
|
||||||
|
|
||||||
def get_content(self) -> str:
|
|
||||||
# Return the final article text
|
|
||||||
return "---\n" + self.get_frontmatter() + "---" + self.get_body()
|
|
||||||
|
|
||||||
def get_documents(self) -> Documents:
|
|
||||||
return Documents(self.id)
|
|
||||||
|
|
||||||
|
|
||||||
class Article(Item):
|
|
||||||
def __init__(self, article: SpipArticles):
|
|
||||||
super().__init__(article)
|
|
||||||
self.id: int = article.id_article
|
|
||||||
self.surtitle: str = convert_meta(article.surtitre) # Probably unused
|
|
||||||
self.subtitle: str = convert_meta(article.soustitre) # Probably unused
|
|
||||||
self.caption: str = convert_body(article.chapo) # Probably unused
|
|
||||||
self.ps: str = convert_body(article.ps) # Probably unused
|
|
||||||
self.update_2: str = article.date_modif # Probably unused duplicate of maj
|
|
||||||
self.creation: str = article.date_redac
|
|
||||||
self.forum: bool = article.accepter_forum == "oui" # TODO Why ?
|
|
||||||
self.sitename: str = article.nom_site # Probably useless
|
|
||||||
self.virtual: str = article.virtuel # TODO Why ?
|
|
||||||
self.microblog: str = article.microblog # Probably unused
|
|
||||||
# self.export = article.export # USELESS
|
|
||||||
# self.views: int = article.visites # USELESS in static
|
|
||||||
# self.referers: int = article.referers # USELESS in static
|
|
||||||
# self.popularity: float = article.popularite # USELESS in static
|
|
||||||
# self.version = article.id_version # USELESS
|
|
||||||
|
|
||||||
def get_authors(self) -> list[SpipAuteurs]:
|
|
||||||
return (
|
|
||||||
SpipAuteurs.select()
|
|
||||||
.join(
|
|
||||||
SpipAuteursLiens,
|
|
||||||
on=(SpipAuteurs.id_auteur == SpipAuteursLiens.id_auteur),
|
|
||||||
)
|
|
||||||
.where(SpipAuteursLiens.id_objet == self.id)
|
|
||||||
)
|
|
||||||
|
|
||||||
def get_frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
|
|
||||||
return super().get_frontmatter(
|
|
||||||
{
|
|
||||||
"surtitle": self.surtitle,
|
|
||||||
"subtitle": self.subtitle,
|
|
||||||
"date": self.creation,
|
|
||||||
"authors": [author.nom for author in self.get_authors()],
|
|
||||||
# Debugging
|
|
||||||
"spip_id_rubrique": self.section_id,
|
|
||||||
"spip_id_secteur": self.sector_id,
|
|
||||||
"spip_chapo": self.caption,
|
|
||||||
}
|
|
||||||
| append
|
|
||||||
if append is not None
|
|
||||||
else {},
|
|
||||||
)
|
|
||||||
|
|
||||||
def get_body(self) -> str:
|
|
||||||
body: str = super().get_body()
|
|
||||||
# If there is a caption, add the caption followed by a hr
|
|
||||||
if hasattr(self, "caption") and len(self.caption) > 0:
|
|
||||||
body += "\n\n" + self.caption + "\n\n***"
|
|
||||||
# PS
|
|
||||||
if hasattr(self, "ps") and len(self.ps) > 0:
|
|
||||||
body += "\n\n# POST-SCRIPTUM\n\n" + self.ps
|
|
||||||
# Microblog
|
|
||||||
if hasattr(self, "microblog") and len(self.microblog) > 0:
|
|
||||||
body += "\n\n# MICROBLOGGING\n\n" + self.microblog
|
|
||||||
return body
|
|
||||||
|
|
||||||
|
|
||||||
class Section(Item):
|
|
||||||
def __init__(self, section: SpipRubriques):
|
|
||||||
super().__init__(section)
|
|
||||||
self.id: int = section.id_rubrique
|
|
||||||
self.parent_id: int = section.id_parent
|
|
||||||
self.depth: int = section.profondeur
|
|
||||||
self.agenda: int = section.agenda
|
|
||||||
|
|
||||||
def get_filename(self) -> str:
|
|
||||||
return "_" + super().get_filename()
|
|
||||||
|
|
||||||
def get_articles(self, limit: int = 0):
|
|
||||||
return Articles(self.id, limit)
|
|
||||||
|
|
||||||
|
|
||||||
class Articles(Iterator):
|
|
||||||
def __init__(self, section_id: int, limit: int = 0):
|
|
||||||
# Query the DB to retrieve all articles sorted by publication date
|
|
||||||
if limit > 0:
|
|
||||||
items = (
|
|
||||||
SpipArticles.select()
|
|
||||||
.where(SpipArticles.id_rubrique == section_id)
|
|
||||||
.order_by(SpipArticles.date.desc())
|
|
||||||
.limit(limit)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
items = (
|
|
||||||
SpipArticles.select()
|
|
||||||
.where(SpipArticles.id_rubrique == section_id)
|
|
||||||
.order_by(SpipArticles.date.desc())
|
|
||||||
)
|
|
||||||
self.items: list[Article] = [Article(i) for i in items]
|
|
||||||
super().__init__()
|
|
||||||
|
|
||||||
|
|
||||||
class Sections(Iterator):
|
|
||||||
def __init__(self, limit: int = 0):
|
|
||||||
# Query the DB to retrieve all sections sorted by publication date
|
|
||||||
if limit > 0:
|
|
||||||
items = (
|
|
||||||
SpipRubriques.select().order_by(SpipRubriques.date.desc()).limit(limit)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
items = SpipRubriques.select().order_by(SpipRubriques.date.desc())
|
|
||||||
self.items: list[Section] = [Section(i) for i in items]
|
|
||||||
super().__init__()
|
|
@ -1,18 +1,19 @@
|
|||||||
#!python
|
#!python
|
||||||
# pyright: strict
|
|
||||||
from os import makedirs
|
from os import makedirs
|
||||||
from os.path import expanduser
|
from os.path import expanduser
|
||||||
from shutil import copyfile, rmtree
|
from shutil import copyfile, rmtree
|
||||||
from sys import argv
|
from sys import argv
|
||||||
|
|
||||||
from config import config
|
from config import config
|
||||||
from converter import get_unknown_chars, unknown_chars
|
from converters import unknown_chars, unknown_chars_context
|
||||||
from database import db
|
from database import DB
|
||||||
from items import (
|
from spipobjects import (
|
||||||
Article,
|
Article,
|
||||||
Document,
|
Document,
|
||||||
Section,
|
Rubrique,
|
||||||
Sections,
|
get_articles,
|
||||||
|
get_documents,
|
||||||
|
get_sections,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -64,26 +65,27 @@ def indent(nb: int = 1) -> None:
|
|||||||
|
|
||||||
|
|
||||||
# Connect to the MySQL database with Peewee ORM
|
# Connect to the MySQL database with Peewee ORM
|
||||||
db.init(config.db, host=config.db_host, user=config.db_user, password=config.db_pass)
|
DB.init(config.db, host=config.db_host, user=config.db_user, password=config.db_pass)
|
||||||
db.connect()
|
DB.connect()
|
||||||
|
|
||||||
|
|
||||||
# Output information about ongoing export & write section to output destination
|
# Output information about ongoing export & write section to output destination
|
||||||
def write_section(index: int, total: int, section: Section) -> str:
|
def write_section(index: int, total: int, section: Rubrique) -> str:
|
||||||
|
color = G # Associate sections to green
|
||||||
# Print the name of the exported section & number of remaining sections
|
# Print the name of the exported section & number of remaining sections
|
||||||
style(f"{index + 1}. ", BO)
|
style(f"{index + 1}. ", BO)
|
||||||
highlight(section.title, *unknown_chars(section.title))
|
highlight(section.titre, *unknown_chars(section.titre))
|
||||||
style(f" {total-index-1}", BO, G)
|
style(f" {total-index-1}", BO, color)
|
||||||
style(f" section{s(total-index)} left")
|
style(f" section{s(total-index)} left")
|
||||||
# Define the section’s path (directory) & create directory(ies) if needed
|
# Define the section’s path (directory) & create directory(ies) if needed
|
||||||
sectiondir: str = config.output_dir + "/" + section.get_slug()
|
sectiondir: str = config.output_dir + "/" + section.slug()
|
||||||
makedirs(sectiondir, exist_ok=True)
|
makedirs(sectiondir, exist_ok=True)
|
||||||
# Define the section filename & write the index at that filename
|
# Define the section filename & write the index at that filename
|
||||||
sectionpath: str = sectiondir + "/" + section.get_filename()
|
sectionpath: str = sectiondir + "/" + section.filename()
|
||||||
with open(sectionpath, "w") as f:
|
with open(sectionpath, "w") as f:
|
||||||
f.write(section.get_content())
|
f.write(section.content())
|
||||||
# Print export location when finished exporting
|
# Print export location when finished exporting
|
||||||
style(" -> ", BO, G)
|
style(" -> ", BO, color)
|
||||||
print(sectionpath)
|
print(sectionpath)
|
||||||
# Return the first "limit" articles of section
|
# Return the first "limit" articles of section
|
||||||
return sectiondir
|
return sectiondir
|
||||||
@ -91,30 +93,31 @@ def write_section(index: int, total: int, section: Section) -> str:
|
|||||||
|
|
||||||
# Output information about ongoing export & write article to output destination
|
# Output information about ongoing export & write article to output destination
|
||||||
def write_article(index: int, total: int, article: Article, sectiondir: str) -> str:
|
def write_article(index: int, total: int, article: Article, sectiondir: str) -> str:
|
||||||
|
color = Y # Associate articles to yellow
|
||||||
# Print the remaining number of articles to export every 100 articles
|
# Print the remaining number of articles to export every 100 articles
|
||||||
if index % 100 == 0:
|
if index % 100 == 0:
|
||||||
indent()
|
indent()
|
||||||
print("Exporting", end="")
|
print("Exporting", end="")
|
||||||
style(f" {total-index}", BO, Y)
|
style(f" {total-index}", BO, color)
|
||||||
print(" SPIP", end="")
|
print(" SPIP", end="")
|
||||||
style(f" article{s(total-index)}")
|
style(f" article{s(total-index)}")
|
||||||
print(" to Markdown & YAML files")
|
print(" to Markdown & YAML files")
|
||||||
# Print the title of the article being exported
|
# Print the title of the article being exported
|
||||||
style(
|
style(
|
||||||
f" {index + 1}. "
|
f" {index + 1}. "
|
||||||
+ ("EMPTY " if len(article.text) < 1 else "")
|
+ ("EMPTY " if len(article.texte) < 1 else "")
|
||||||
+ f"{article.lang} "
|
+ f"{article.lang} "
|
||||||
)
|
)
|
||||||
highlight(article.title, *unknown_chars(article.title))
|
highlight(article.titre, *unknown_chars(article.titre))
|
||||||
# Define the full article path & create directory(ies) if needed
|
# Define the full article path & create directory(ies) if needed
|
||||||
articledir: str = sectiondir + "/" + article.get_slug()
|
articledir: str = sectiondir + "/" + article.slug()
|
||||||
makedirs(articledir, exist_ok=True)
|
makedirs(articledir, exist_ok=True)
|
||||||
# Define the article filename & write the article at the filename
|
# Define the article filename & write the article at the filename
|
||||||
articlepath: str = articledir + "/" + article.get_filename()
|
articlepath: str = articledir + "/" + article.filename()
|
||||||
with open(articlepath, "w") as f:
|
with open(articlepath, "w") as f:
|
||||||
f.write(article.get_content())
|
f.write(article.content())
|
||||||
# Print export location when finished exporting
|
# Print export location when finished exporting
|
||||||
style(" -> ", BO, B)
|
style(" -> ", BO, color)
|
||||||
print(articlepath)
|
print(articlepath)
|
||||||
return articledir
|
return articledir
|
||||||
|
|
||||||
@ -123,34 +126,35 @@ def write_article(index: int, total: int, article: Article, sectiondir: str) ->
|
|||||||
def write_document(
|
def write_document(
|
||||||
index: int, total: int, document: Document, objectdir: str, indent_depth: int = 1
|
index: int, total: int, document: Document, objectdir: str, indent_depth: int = 1
|
||||||
) -> None:
|
) -> None:
|
||||||
|
color = B # Associate documents to blue
|
||||||
if index % 100 == 0:
|
if index % 100 == 0:
|
||||||
indent(indent_depth)
|
indent(indent_depth)
|
||||||
print("Exporting", end="")
|
print("Exporting", end="")
|
||||||
style(f" {total-index}", BO, B)
|
style(f" {total-index}", BO, color)
|
||||||
style(f" document{s(total-index)}\n")
|
style(f" document{s(total-index)}\n")
|
||||||
# Print the name of the file with a counter
|
# Print the name of the file with a counter
|
||||||
indent(indent_depth)
|
indent(indent_depth)
|
||||||
style(f"{index + 1}. {document.media} ")
|
style(f"{index + 1}. {document.media} ")
|
||||||
if len(document.title) > 0:
|
if len(document.titre) > 0:
|
||||||
highlight(document.title + " ", *unknown_chars(document.title))
|
highlight(document.titre + " ", *unknown_chars(document.titre))
|
||||||
style("at ")
|
style("at ")
|
||||||
print(document.file, end="")
|
print(document.fichier, end="")
|
||||||
# Define document path
|
# Define document path
|
||||||
documentpath: str = expanduser(config.data_dir + "/" + document.file)
|
documentpath: str = expanduser(config.data_dir + "/" + document.fichier)
|
||||||
# Copy the document from it’s SPIP location to the new location
|
# Copy the document from it’s SPIP location to the new location
|
||||||
try:
|
try:
|
||||||
copyfile(documentpath, objectdir + "/" + document.get_slug())
|
copyfile(documentpath, objectdir + "/" + document.slug())
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
style(" -> NOT FOUND!\n", BO, R)
|
style(" -> NOT FOUND!\n", BO, R)
|
||||||
else:
|
else:
|
||||||
# Print the outputted file’s path when copied the file
|
# Print the outputted file’s path when copied the file
|
||||||
style(" ->", BO, B)
|
style(" ->", BO, color)
|
||||||
print(f" {objectdir}/{document.get_slug()}")
|
print(f" {objectdir}/{document.slug()}")
|
||||||
|
|
||||||
|
|
||||||
# Return true if an article field contains an unknown character
|
# Return true if an article field contains an unknown character
|
||||||
def has_unknown_chars(article: Article) -> bool:
|
def has_unknown_chars(article: Article) -> bool:
|
||||||
if len(get_unknown_chars(article.text)) > 0:
|
if len(unknown_chars_context(article.texte)) > 0:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -159,13 +163,13 @@ def has_unknown_chars(article: Article) -> bool:
|
|||||||
def warn_unknown_chars(article: Article) -> None:
|
def warn_unknown_chars(article: Article) -> None:
|
||||||
# Print the title of the article in which there is unknown characters
|
# Print the title of the article in which there is unknown characters
|
||||||
# & the number of them
|
# & the number of them
|
||||||
unknown_chars_apparitions: list[str] = get_unknown_chars(article.text)
|
unknown_chars_apparitions: list[str] = unknown_chars_context(article.texte)
|
||||||
nb: int = len(unknown_chars_apparitions)
|
nb: int = len(unknown_chars_apparitions)
|
||||||
s: str = "s" if nb > 1 else ""
|
s: str = "s" if nb > 1 else ""
|
||||||
style(f"{nb}")
|
style(f"{nb}")
|
||||||
print(f" unknown character{s} in", end="")
|
print(f" unknown character{s} in", end="")
|
||||||
style(f" {article.lang} ")
|
style(f" {article.lang} ")
|
||||||
highlight(article.title, *unknown_chars(article.title))
|
highlight(article.titre, *unknown_chars(article.titre))
|
||||||
print() # Break line
|
print() # Break line
|
||||||
# Print the context in which the unknown characters are found
|
# Print the context in which the unknown characters are found
|
||||||
for text in unknown_chars_apparitions:
|
for text in unknown_chars_apparitions:
|
||||||
@ -197,7 +201,7 @@ if __name__ == "__main__":
|
|||||||
unknown_chars_articles: list[Article] = []
|
unknown_chars_articles: list[Article] = []
|
||||||
|
|
||||||
# Get sections with an eventual maximum
|
# Get sections with an eventual maximum
|
||||||
sections = Sections(max_sections_export)
|
sections = get_sections(max_sections_export)
|
||||||
nb_sections_export: int = len(sections)
|
nb_sections_export: int = len(sections)
|
||||||
|
|
||||||
# Loop among sections & export them
|
# Loop among sections & export them
|
||||||
@ -205,11 +209,11 @@ if __name__ == "__main__":
|
|||||||
# Write the section & store its articles
|
# Write the section & store its articles
|
||||||
sectiondir = write_section(i, nb_sections_export, section)
|
sectiondir = write_section(i, nb_sections_export, section)
|
||||||
# Loop over section’s related files (images …)
|
# Loop over section’s related files (images …)
|
||||||
documents = section.get_documents()
|
documents = get_documents(section.id_rubrique)
|
||||||
for i, document in enumerate(documents):
|
for i, document in enumerate(documents):
|
||||||
write_document(i, len(documents), document, sectiondir)
|
write_document(i, len(documents), document, sectiondir)
|
||||||
# Loop over section’s articles
|
# Loop over section’s articles
|
||||||
articles = section.get_articles(max_articles_export)
|
articles = get_articles(section.id_rubrique, (max_articles_export))
|
||||||
for i, article in enumerate(articles):
|
for i, article in enumerate(articles):
|
||||||
articledir = write_article(i, len(articles), article, sectiondir)
|
articledir = write_article(i, len(articles), article, sectiondir)
|
||||||
# Add article to unknown_chars_articles if needed
|
# Add article to unknown_chars_articles if needed
|
||||||
@ -218,7 +222,7 @@ if __name__ == "__main__":
|
|||||||
# Decrement export limit
|
# Decrement export limit
|
||||||
max_articles_export -= 1
|
max_articles_export -= 1
|
||||||
# Loop over article’s related files (images …)
|
# Loop over article’s related files (images …)
|
||||||
documents = section.get_documents()
|
documents = get_documents(article.id_article)
|
||||||
for i, document in enumerate(documents):
|
for i, document in enumerate(documents):
|
||||||
write_document(i, len(documents), document, sectiondir, 2)
|
write_document(i, len(documents), document, sectiondir, 2)
|
||||||
# Break line when finished exporting the section
|
# Break line when finished exporting the section
|
||||||
@ -229,4 +233,4 @@ if __name__ == "__main__":
|
|||||||
for article in unknown_chars_articles:
|
for article in unknown_chars_articles:
|
||||||
warn_unknown_chars(article)
|
warn_unknown_chars(article)
|
||||||
|
|
||||||
db.close() # Close the connection with the database
|
DB.close() # Close the connection with the database
|
||||||
|
212
spip2md/spipobjects.py
Normal file
212
spip2md/spipobjects.py
Normal file
@ -0,0 +1,212 @@
|
|||||||
|
from os.path import basename, splitext
|
||||||
|
|
||||||
|
from peewee import ModelSelect
|
||||||
|
from slugify import slugify
|
||||||
|
from yaml import dump
|
||||||
|
|
||||||
|
from converters import convert
|
||||||
|
from database import (
|
||||||
|
SpipArticles,
|
||||||
|
SpipAuteurs,
|
||||||
|
SpipAuteursLiens,
|
||||||
|
SpipDocuments,
|
||||||
|
SpipDocumentsLiens,
|
||||||
|
SpipRubriques,
|
||||||
|
)
|
||||||
|
|
||||||
|
# File extension appended to every exported file (e.g. "index.fr.md").
EXPORTTYPE: str = "md"

# TODO: convert images & files links inside the text, e.g.:
# text: str = convert_documents(
#     self.texte,
#     [(d.id, d.titre, d.slug()) for d in self.documents()],
# )
|
||||||
|
|
||||||
|
|
||||||
|
class Document(SpipDocuments):
    """A SPIP document (image, attachment …) linked to other content.

    On instantiation, SPIP-formatted text fields are converted to
    Markdown and the SPIP status is mapped to a "draft"-style flag.
    """

    class Meta:
        table_name: str = "spip_documents"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Titles & descriptions are single-line: convert with title mode on
        self.titre: str = convert(self.titre, True)
        self.descriptif: str = convert(self.descriptif, True)
        # "publie" means published, so the draft flag is "false"
        self.statut: str = "false" if self.statut == "publie" else "true"

    def slug(self, date: bool = False) -> str:
        """Return a URL-safe filename for this document.

        The original file extension is preserved untouched; when `date`
        is True the publication date is prefixed to the name.
        """
        name_type: tuple[str, str] = splitext(basename(self.fichier))
        # str() guards against date_publication being a date/datetime
        # object rather than a string (Peewee date fields) — TODO confirm
        prefix: str = str(self.date_publication) + "-" if date else ""
        return slugify(prefix + name_type[0]) + name_type[1]
|
||||||
|
|
||||||
|
|
||||||
|
class Article(SpipArticles):
    """A SPIP article, converted to Markdown on instantiation, with
    helpers to render the exported file (frontmatter + body)."""

    class Meta:
        table_name: str = "spip_articles"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.titre: str = convert(self.titre, True)
        self.descriptif: str = convert(self.descriptif, True)
        self.texte: str = convert(self.texte)  # Convert SPIP to Markdown
        # Map SPIP status/flags to "true"/"false" strings for YAML output
        self.statut: str = "false" if self.statut == "publie" else "true"
        self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
        self.extra: str = convert(self.extra)  # Probably unused
        # Article specific
        self.surtitle: str = convert(self.surtitre, True)  # Probably unused
        self.subtitle: str = convert(self.soustitre, True)  # Probably unused
        self.caption: str = convert(self.chapo)  # Probably unused
        self.ps: str = convert(self.ps)  # Probably unused
        self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"

    def slug(self, date: bool = False) -> str:
        """Return a URL-safe name for this article, optionally prefixed
        with the publication date."""
        # str() guards against self.date being a date/datetime object
        return slugify((str(self.date) + "-" if date else "") + self.titre)

    def filename(self) -> str:
        """Return the export filename, e.g. "index.fr.md"."""
        return "index" + "." + self.lang + "." + EXPORTTYPE

    def frontmatter(self) -> str:
        """Return the YAML frontmatter of this article as a string."""
        return dump(
            {
                "lang": self.lang,
                "translationKey": self.id_trad,
                "title": self.titre,
                "publishDate": self.date,
                "lastmod": self.maj,
                "draft": self.statut,
                "description": self.descriptif,
                # Debugging
                "spip_id": self.id_article,
                "spip_id_secteur": self.id_secteur,
                # Article specific
                "surtitle": self.surtitle,
                "subtitle": self.subtitle,
                "date": self.date_redac,
                "authors": [author.nom for author in self.authors()],
                # Debugging
                "spip_id_rubrique": self.id_rubrique,
                "spip_chapo": self.caption,
            },
            allow_unicode=True,
        )

    def body(self) -> str:
        """Return the Markdown body: title, text, extra, caption,
        post-scriptum and microblog sections, in that order."""
        body: str = ""
        # Add the title as a Markdown h1
        if len(self.titre) > 0:
            body += "\n\n# " + self.titre
        # If there is a text, add the text preceded by two line breaks
        if len(self.texte) > 0:
            # BUGFIX: the converted text was previously never appended —
            # only the two line breaks were, dropping the article content
            body += "\n\n" + self.texte
        # Same with an "extra" section
        if len(self.extra) > 0:
            body += "\n\n# EXTRA\n\n" + self.extra
        # If there is a caption, add the caption followed by a hr
        if hasattr(self, "caption") and len(self.caption) > 0:
            body += "\n\n" + self.caption + "\n\n***"
        # PS
        if hasattr(self, "ps") and len(self.ps) > 0:
            body += "\n\n# POST-SCRIPTUM\n\n" + self.ps
        # Microblog
        if hasattr(self, "microblog") and len(self.microblog) > 0:
            body += "\n\n# MICROBLOGGING\n\n" + self.microblog
        return body

    def content(self) -> str:
        """Return the final article text: frontmatter + body."""
        return "---\n" + self.frontmatter() + "---" + self.body()

    def authors(self) -> list[SpipAuteurs]:
        """Return the authors of this article via the SPIP link table."""
        return (
            SpipAuteurs.select()
            .join(
                SpipAuteursLiens,
                on=(SpipAuteurs.id_auteur == SpipAuteursLiens.id_auteur),
            )
            .where(SpipAuteursLiens.id_objet == self.id_article)
        )
|
||||||
|
|
||||||
|
|
||||||
|
class Rubrique(SpipRubriques):
    """A SPIP section (rubrique), converted to Markdown on
    instantiation, with helpers to render the exported index file."""

    class Meta:
        table_name: str = "spip_rubriques"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.titre: str = convert(self.titre, True)
        self.descriptif: str = convert(self.descriptif, True)
        self.texte: str = convert(self.texte)  # Convert SPIP to Markdown
        # Map SPIP status/flags to "true"/"false" strings for YAML output
        self.statut: str = "false" if self.statut == "publie" else "true"
        self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
        self.extra: str = convert(self.extra)  # Probably unused

    def slug(self, date: bool = False) -> str:
        """Return a URL-safe name for this section, optionally prefixed
        with the publication date."""
        # str() guards against self.date being a date/datetime object
        return slugify((str(self.date) + "-" if date else "") + self.titre)

    def filename(self) -> str:
        """Return the export filename, e.g. "index.fr.md"."""
        return "index" + "." + self.lang + "." + EXPORTTYPE

    def frontmatter(self) -> str:
        """Return the YAML frontmatter of this section as a string."""
        return dump(
            {
                "lang": self.lang,
                "translationKey": self.id_trad,
                "title": self.titre,
                "publishDate": self.date,
                "lastmod": self.maj,
                "draft": self.statut,
                "description": self.descriptif,
                # Debugging
                "spip_id": self.id_rubrique,
                "spip_id_secteur": self.id_secteur,
            },
            allow_unicode=True,
        )

    def body(self) -> str:
        """Return the Markdown body: title, text and extra sections."""
        body: str = ""
        # Add the title as a Markdown h1
        if len(self.titre) > 0:
            body += "\n\n# " + self.titre
        # If there is a text, add the text preceded by two line breaks
        if len(self.texte) > 0:
            # BUGFIX: the converted text was previously never appended —
            # only the two line breaks were, dropping the section content
            body += "\n\n" + self.texte
        # Same with an "extra" section
        if len(self.extra) > 0:
            body += "\n\n# EXTRA\n\n" + self.extra
        return body

    def content(self) -> str:
        """Return the final section text: frontmatter + body."""
        return "---\n" + self.frontmatter() + "---" + self.body()
|
||||||
|
|
||||||
|
|
||||||
|
# Query the DB to retrieve all sections sorted by publication date
def get_sections(limit: int = 10**6) -> ModelSelect:
    """Return up to `limit` sections, most recently published first."""
    sections = Rubrique.select().order_by(Rubrique.date.desc())
    return sections.limit(limit)
|
||||||
|
|
||||||
|
|
||||||
|
# Query the DB to retrieve all articles sorted by publication date
def get_articles(section_id: int, limit: int = 10**6) -> ModelSelect:
    """Return up to `limit` articles of the given section, newest first."""
    in_section = Article.id_rubrique == section_id
    query = Article.select().where(in_section)
    return query.order_by(Article.date.desc()).limit(limit)
|
||||||
|
|
||||||
|
|
||||||
|
# Query the DB to retrieve all documents related to object of id object_id
def get_documents(object_id: int, limit: int = 10**6) -> ModelSelect:
    """Return up to `limit` documents linked to the object `object_id`."""
    linked = Document.select().join(
        SpipDocumentsLiens,
        on=(Document.id_document == SpipDocumentsLiens.id_document),
    )
    return linked.where(SpipDocumentsLiens.id_objet == object_id).limit(limit)
|
Loading…
Reference in New Issue
Block a user