diff --git a/spip2md/__init__.py b/spip2md/__init__.py index f67b322..7262275 100644 --- a/spip2md/__init__.py +++ b/spip2md/__init__.py @@ -9,7 +9,7 @@ from peewee import ModelSelect from spip2md.config import CFG from spip2md.database import DB -from spip2md.spipobjects import Rubrique +from spip2md.spipobjects import RootRubrique, Rubrique # Define styles BOLD = 1 # Bold @@ -56,6 +56,7 @@ def root_sections(limit: int = 10**3) -> ModelSelect: .limit(limit) ) + r""" # Print the detected unknown chars in article in their context but highlighted def warn_unknown_chars(article: Article) -> None: @@ -80,12 +81,22 @@ def warn_unknown_chars(article: Article) -> None: # Print one root section list output correctly # sys.setrecursionlimit(2000) -def print_output(tree: list[Any], depth: int = 0, indent: str = " ") -> None: +def print_output( + tree: list[Any], + indent: str = " ", + depth: int = 0, + branches: int = 1, + leaves: int = 0, +) -> tuple[int, int]: for sub in tree: if type(sub) == list: - print_output(sub, depth + 1) + branches, leaves = print_output( + sub, indent, depth + 1, branches + 1, leaves + ) else: + leaves += 1 print(indent * depth + sub) + return (branches, leaves) # Connect to the MySQL database with Peewee ORM @@ -95,32 +106,28 @@ DB.connect() # Main loop to execute only if script is directly executed def main(*argv): + # Allow main to get args when directly executed if len(argv) == 0: argv = sys.argv - # Define max nb of sections to export based on first CLI argument TODO - if len(argv) >= 2: - sections_export = int(argv[1]) - else: - sections_export = CFG.max_sections_export - # Define max nb of articles to export based on second CLI argument TODO - # if len(argv) >= 3: - # articles_export = int(argv[2]) + + # TODO Define max nb of sections to export based on first CLI argument + # if len(argv) >= 2: + # sections_export = int(argv[1]) # else: - # articles_export = CFG.max_articles_export + # sections_export = CFG.max_sections_export # Clear the output dir & create a new if CFG.clear_output: rmtree(CFG.output_dir, True) makedirs(CFG.output_dir, exist_ok=True) - # Get the first max_sections_export root sections - sections: ModelSelect = root_sections(sections_export) - total: int = len(sections) + # Get the virtual id=0 section + root: Rubrique = RootRubrique() - # Write each root sections with its subtree - for i, section in enumerate(sections): - print_output(section.write_tree(CFG.output_dir, i, total)) - print() # Break line after exporting the section + # Write everything & print the output human-readably + sections, articles = print_output(root.write_tree(CFG.output_dir)) + # End, summary message + print(f"Exported a total of {sections} sections, containing {articles} articles") # print() # Break line between export & unknown characters warning # Warn about each article that contains unknown(s) character(s) diff --git a/spip2md/regexmap.py b/spip2md/regexmap.py index 314f0e1..277eb03 100644 --- a/spip2md/regexmap.py +++ b/spip2md/regexmap.py @@ -112,11 +112,9 @@ SPIP_MARKDOWN = ( DOCUMENT_LINK = r"(!)?\[(.*?)\]\(({})\)" DOCUMENT_LINK_REPL = r"\1[\2{}]({})" -# Multi language block, capture groups: (lang, text, lang, text, …) -MULTILANG = compile( - r"(?:\s*\[(.{2,6})\]\s*(.*?)\s*)+<\/multi>", - S | I, -) +# Multi language block, to be further processed per lang +MULTILANG_BLOCK = compile(r"(.+?)<\/multi>", S | I) +MULTILANGS = compile(r"\[([a-zA-Z\-]{2,6})\]\s*(.+?)(?=\[[a-zA-Z\-]{2,6}\]|$)", S | I) # WARNING probably useless text in metadata fields, to be removed BLOAT = ( diff --git a/spip2md/spipobjects.py b/spip2md/spipobjects.py index ee4ffe5..f2f4ef5 100644 --- a/spip2md/spipobjects.py +++ b/spip2md/spipobjects.py @@ -3,7 +3,7 @@ from os import makedirs from os.path import basename, splitext from re import finditer, sub from shutil import copyfile -from typing import Any, Optional +from typing import Any, Match, Optional from peewee import BigAutoField, DateTimeField, ModelSelect from slugify import slugify @@ -24,7 +24,8 @@ from spip2md.regexmap import ( DOCUMENT_LINK_REPL, HTMLTAG, ISO_UTF, - MULTILANG, + MULTILANG_BLOCK, + MULTILANGS, SPIP_MARKDOWN, UNKNOWN_ISO, ) @@ -35,8 +36,47 @@ class SpipWritable: texte: str lang: str titre: str + descriptif: str profondeur: int + # Returns the first detected language (& instantiate a new object for the second) + # (currently don’t instantiate, just warns) + def translate(self, text: str) -> str: + def replace_lang(match: Match[str]) -> str: + first_lang: str = match.group(1) + # The first group is the inside of blocks + for i, lang in enumerate(MULTILANGS.finditer(match.group(1))): + if i == 0: + # Redefine this lang to the first one WARNING + self.lang = lang.group(1) + # Outputs the first lang associated text + first_lang = lang.group(2) + else: + pass + # print("Found other language for", first_lang, ":", lang.groups()) + return first_lang + + return MULTILANG_BLOCK.sub(replace_lang, text) + + # Apply different mappings to a text field, like SPIP to Markdown or encoding + def convert(self, text: Optional[str]) -> str: + if text is not None and len(text) > 0: + for spip, markdown in SPIP_MARKDOWN: + text = spip.sub(markdown, text) + for bloat in BLOAT: + text = bloat.sub("", text) + for iso, utf in ISO_UTF: + text = text.replace(iso, utf) + text = self.translate(text) + else: + return "" + return text + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.titre = self.convert(self.titre) + self.descriptif = self.convert(self.descriptif) + def filename(self, date: bool = False) -> str: raise NotImplementedError( f"Subclasses need to implement filename(), date: {date}" @@ -59,19 +99,6 @@ class SpipWritable: output[-1] += "MISSING NAME" return output - # Apply different mappings to text fields, like SPIP to Markdown or encoding - def convert_attrs(self, *attrs: str) -> None: - attrs += "titre", "descriptif" - for attr in attrs: - a = getattr(self, attr) - if len(a) > 0: - for spip, markdown in SPIP_MARKDOWN: - setattr(self, attr, spip.sub(markdown, a)) - for bloat in BLOAT: - setattr(self, attr, bloat.sub("", a)) - for iso, utf in ISO_UTF: - setattr(self, attr, a.replace(iso, utf)) - # Write object to output destination def write(self, parent_dir: str) -> str: raise NotImplementedError( @@ -107,8 +134,6 @@ class Document(SpipWritable, SpipDocuments): # Write document to output destination def write(self, parent_dir: str) -> str: - # Apply needed conversions - super().convert_attrs() # Define file source and destination src: str = CFG.data_dir + self.fichier dest: str = parent_dir + self.filename() @@ -129,6 +154,7 @@ class SpipObject(SpipWritable): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Common fields that need conversions + self.extra: str = self.convert(self.extra) self.statut: str = "false" if self.statut == "publie" else "true" self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true" # Define file prefix (needs to be redefined for sections) @@ -223,13 +249,21 @@ class SpipObject(SpipWritable): body += "\n\n# EXTRA\n\n" + self.extra return body - def convert_attrs(self, *attrs: str) -> None: - return super().convert_attrs(*attrs, "descriptif", "extra") + # Clean remaining HTML tags in attrs + def clean_html(self, *attrs: str) -> None: + attrs += "titre", "texte", "descriptif", "extra" + for attr in attrs: + a = getattr(self, attr) + if len(a) > 0: + setattr(self, attr, HTMLTAG.sub("", a)) # Write object to output destination - def write(self, parent_dir: str) -> str: - # Apply needed conversions - super().convert_attrs() + def write(self, parent_dir: str, clean_html: bool = True) -> str: + # Link articles + self.link_articles() + # Delete remaining HTML tags WARNING + if clean_html: + self.clean_html() # Define actual export directory directory: str = parent_dir + self.dir_slug() # Make a directory for this object if there isn’t @@ -249,15 +283,14 @@ class Article(SpipObject, SpipArticles): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # More conversions needed for articles + self.surtitre: str = self.convert(self.surtitre) + self.soustitre: str = self.convert(self.soustitre) + self.chapo: str = self.convert(self.chapo) + self.ps: str = self.convert(self.ps) self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false" # ID self.object_id = self.id_article - def convert_attrs(self, *attrs: str) -> None: - return super().convert_attrs( - *attrs, "surtitre", "soustitre", "chapo", "ps", "accepter_forum" - ) - def frontmatter(self, append: Optional[dict[str, Any]] = None) -> str: meta: dict[str, Any] = { # Article specific @@ -331,7 +364,6 @@ class Rubrique(SpipObject, SpipRubriques): articles = self.articles() documents = self.documents() # Write this section - self.link_articles() output[-1] += self.end_message(self.write(parent_dir)) # Redefine parent_dir for subtree elements parent_dir = parent_dir + self.dir_slug() @@ -363,3 +395,36 @@ class Rubrique(SpipObject, SpipRubriques): for i, s in enumerate(child_sections): output.append(s.write_tree(parent_dir, i, nb)) return output + + +class RootRubrique(Rubrique): + class Meta: + table_name: str = "spip_rubriques" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # 0 ID + self.id_rubrique = 0 + # self.object_id = 0 + + def write_tree( + self, parent_dir: str, sections_limit: int = 0, articles_limit: int = 0 + ) -> list[str | list[Any]]: + # Define dictionary output to diplay + output: list[str | list[Any]] = [] + # Starting message + output.append( + f"Begin converting {CFG.db}@{CFG.db_host} db to plain Markdown+YAML files" + ) + output.append(f" as db user {CFG.db_user}, into the directory {parent_dir}") + # Get all child section of self + child_sections = ( + Rubrique.select() + .where(Rubrique.id_parent == self.id_rubrique) + .order_by(Rubrique.date.desc()) + ) + nb: int = len(child_sections) + # Do the same for subsections (write their entire subtree) + for i, s in enumerate(child_sections): + output.append(s.write_tree(parent_dir, i, nb)) + return output