improved architecture, started handling <multi> blocks

2023-05-26 16:39:48 +02:00 · 2023-05-26 16:39:48 +02:00 · 952595b34c
commit 952595b34c
parent 32738a9269
3 changed files with 122 additions and 52 deletions
--- a/spip2md/init.py
+++ b/spip2md/init.py
@ -9,7 +9,7 @@ from peewee import ModelSelect
 from spip2md.config import CFG
 from spip2md.database import DB
-from spip2md.spipobjects import Rubrique
+from spip2md.spipobjects import RootRubrique, Rubrique
 # Define styles
 BOLD = 1  # Bold
@ -56,6 +56,7 @@ def root_sections(limit: int = 10**3) -> ModelSelect:
        .limit(limit)
    )
 r"""
 # Print the detected unknown chars in article in their context but highlighted
 def warn_unknown_chars(article: Article) -> None:
@ -80,12 +81,22 @@ def warn_unknown_chars(article: Article) -> None:
 # Print one root section list output correctly
 # sys.setrecursionlimit(2000)
-def print_output(tree: list[Any], depth: int = 0, indent: str = "  ") -> None:
+def print_output(
    tree: list[Any],
    indent: str = "  ",
    depth: int = 0,
    branches: int = 1,
    leaves: int = 0,
 ) -> tuple[int, int]:
    for sub in tree:
        if type(sub) == list:
-            print_output(sub, depth + 1)
+            branches, leaves = print_output(
                sub, indent, depth + 1, branches + 1, leaves
            )
        else:
            leaves += 1
            print(indent * depth + sub)
    return (branches, leaves)
 # Connect to the MySQL database with Peewee ORM
@ -95,32 +106,28 @@ DB.connect()
 # Main loop to execute only if script is directly executed
 def main(*argv):
    # Allow main to get args when directly executed
    if len(argv) == 0:
        argv = sys.argv
-    # Define max nb of sections to export based on first CLI argument TODO
+
-    if len(argv) >= 2:
+    # TODO Define max nb of sections to export based on first CLI argument
-        sections_export = int(argv[1])
+    # if len(argv) >= 2:
-    else:
+    #     sections_export = int(argv[1])
        sections_export = CFG.max_sections_export
    # Define max nb of articles to export based on second CLI argument TODO
    # if len(argv) >= 3:
    #     articles_export = int(argv[2])
    # else:
-    #     articles_export = CFG.max_articles_export
+    #     sections_export = CFG.max_sections_export
    # Clear the output dir & create a new
    if CFG.clear_output:
        rmtree(CFG.output_dir, True)
    makedirs(CFG.output_dir, exist_ok=True)
-    # Get the first max_sections_export root sections
+    # Get the virtual id=0 section
-    sections: ModelSelect = root_sections(sections_export)
+    root: Rubrique = RootRubrique()
    total: int = len(sections)
-    # Write each root sections with its subtree
+    # Write everything & print the output human-readably
-    for i, section in enumerate(sections):
+    sections, articles = print_output(root.write_tree(CFG.output_dir))
-        print_output(section.write_tree(CFG.output_dir, i, total))
+    # End, summary message
-        print()  # Break line after exporting the section
+    print(f"Exported a total of {sections} sections, containing {articles} articles")
    # print()  # Break line between export & unknown characters warning
    # Warn about each article that contains unknown(s) character(s)
--- a/spip2md/regexmap.py
+++ b/spip2md/regexmap.py
@ -112,11 +112,9 @@ SPIP_MARKDOWN = (
 DOCUMENT_LINK = r"(!)?\[(.*?)\]\(({})\)"
 DOCUMENT_LINK_REPL = r"\1[\2{}]({})"
-# Multi language block, capture groups: (lang, text, lang, text, …)
+# Multi language block, to be further processed per lang
-MULTILANG = compile(
+MULTILANG_BLOCK = compile(r"<multi>(.+?)<\/multi>", S | I)
-    r"<multi>(?:\s*\[(.{2,6})\]\s*(.*?)\s*)+<\/multi>",
+MULTILANGS = compile(r"\[([a-zA-Z\-]{2,6})\]\s*(.+?)(?=\[[a-zA-Z\-]{2,6}\]|$)", S | I)
    S | I,
 )
 # WARNING probably useless text in metadata fields, to be removed
 BLOAT = (
--- a/spip2md/spipobjects.py
+++ b/spip2md/spipobjects.py
@ -3,7 +3,7 @@ from os import makedirs
 from os.path import basename, splitext
 from re import finditer, sub
 from shutil import copyfile
-from typing import Any, Optional
+from typing import Any, Match, Optional
 from peewee import BigAutoField, DateTimeField, ModelSelect
 from slugify import slugify
@ -24,7 +24,8 @@ from spip2md.regexmap import (
    DOCUMENT_LINK_REPL,
    HTMLTAG,
    ISO_UTF,
-    MULTILANG,
+    MULTILANG_BLOCK,
    MULTILANGS,
    SPIP_MARKDOWN,
    UNKNOWN_ISO,
 )
@ -35,8 +36,47 @@ class SpipWritable:
    texte: str
    lang: str
    titre: str
    descriptif: str
    profondeur: int
    # Returns the first detected language (& instantiate a new object for the second)
    # (currently don’t instantiate, just warns)
    def translate(self, text: str) -> str:
        def replace_lang(match: Match[str]) -> str:
            first_lang: str = match.group(1)
            # The first group is the inside of <multi></multi> blocks
            for i, lang in enumerate(MULTILANGS.finditer(match.group(1))):
                if i == 0:
                    # Redefine this lang to the first one WARNING
                    self.lang = lang.group(1)
                    # Outputs the first lang associated text
                    first_lang = lang.group(2)
                else:
                    pass
                    # print("Found other language for", first_lang, ":", lang.groups())
            return first_lang
        return MULTILANG_BLOCK.sub(replace_lang, text)
    # Apply different mappings to a text field, like SPIP to Markdown or encoding
    def convert(self, text: Optional[str]) -> str:
        if text is not None and len(text) > 0:
            for spip, markdown in SPIP_MARKDOWN:
                text = spip.sub(markdown, text)
            for bloat in BLOAT:
                text = bloat.sub("", text)
            for iso, utf in ISO_UTF:
                text = text.replace(iso, utf)
            text = self.translate(text)
        else:
            return ""
        return text
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.titre = self.convert(self.titre)
        self.descriptif = self.convert(self.descriptif)
    def filename(self, date: bool = False) -> str:
        raise NotImplementedError(
            f"Subclasses need to implement filename(), date: {date}"
@ -59,19 +99,6 @@ class SpipWritable:
            output[-1] += "MISSING NAME"
        return output
    # Apply different mappings to text fields, like SPIP to Markdown or encoding
    def convert_attrs(self, *attrs: str) -> None:
        attrs += "titre", "descriptif"
        for attr in attrs:
            a = getattr(self, attr)
            if len(a) > 0:
                for spip, markdown in SPIP_MARKDOWN:
                    setattr(self, attr, spip.sub(markdown, a))
                for bloat in BLOAT:
                    setattr(self, attr, bloat.sub("", a))
                for iso, utf in ISO_UTF:
                    setattr(self, attr, a.replace(iso, utf))
    # Write object to output destination
    def write(self, parent_dir: str) -> str:
        raise NotImplementedError(
@ -107,8 +134,6 @@ class Document(SpipWritable, SpipDocuments):
    # Write document to output destination
    def write(self, parent_dir: str) -> str:
        # Apply needed conversions
        super().convert_attrs()
        # Define file source and destination
        src: str = CFG.data_dir + self.fichier
        dest: str = parent_dir + self.filename()
@ -129,6 +154,7 @@ class SpipObject(SpipWritable):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Common fields that need conversions
        self.extra: str = self.convert(self.extra)
        self.statut: str = "false" if self.statut == "publie" else "true"
        self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
        # Define file prefix (needs to be redefined for sections)
@ -223,13 +249,21 @@ class SpipObject(SpipWritable):
            body += "\n\n# EXTRA\n\n" + self.extra
        return body
-    def convert_attrs(self, *attrs: str) -> None:
+    # Clean remaining HTML tags in attrs
-        return super().convert_attrs(*attrs, "descriptif", "extra")
+    def clean_html(self, *attrs: str) -> None:
        attrs += "titre", "texte", "descriptif", "extra"
        for attr in attrs:
            a = getattr(self, attr)
            if len(a) > 0:
                setattr(self, attr, HTMLTAG.sub("", a))
    # Write object to output destination
-    def write(self, parent_dir: str) -> str:
+    def write(self, parent_dir: str, clean_html: bool = True) -> str:
-        # Apply needed conversions
+        # Link articles
-        super().convert_attrs()
+        self.link_articles()
        # Delete remaining HTML tags WARNING
        if clean_html:
            self.clean_html()
        # Define actual export directory
        directory: str = parent_dir + self.dir_slug()
        # Make a directory for this object if there isn’t
@ -249,15 +283,14 @@ class Article(SpipObject, SpipArticles):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # More conversions needed for articles
        self.surtitre: str = self.convert(self.surtitre)
        self.soustitre: str = self.convert(self.soustitre)
        self.chapo: str = self.convert(self.chapo)
        self.ps: str = self.convert(self.ps)
        self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"
        # ID
        self.object_id = self.id_article
    def convert_attrs(self, *attrs: str) -> None:
        return super().convert_attrs(
            *attrs, "surtitre", "soustitre", "chapo", "ps", "accepter_forum"
        )
    def frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
        meta: dict[str, Any] = {
            # Article specific
@ -331,7 +364,6 @@ class Rubrique(SpipObject, SpipRubriques):
        articles = self.articles()
        documents = self.documents()
        # Write this section
        self.link_articles()
        output[-1] += self.end_message(self.write(parent_dir))
        # Redefine parent_dir for subtree elements
        parent_dir = parent_dir + self.dir_slug()
@ -363,3 +395,36 @@ class Rubrique(SpipObject, SpipRubriques):
        for i, s in enumerate(child_sections):
            output.append(s.write_tree(parent_dir, i, nb))
        return output
 class RootRubrique(Rubrique):
    class Meta:
        table_name: str = "spip_rubriques"
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 0 ID
        self.id_rubrique = 0
        # self.object_id = 0
    def write_tree(
        self, parent_dir: str, sections_limit: int = 0, articles_limit: int = 0
    ) -> list[str | list[Any]]:
        # Define dictionary output to diplay
        output: list[str | list[Any]] = []
        # Starting message
        output.append(
            f"Begin converting {CFG.db}@{CFG.db_host} db to plain Markdown+YAML files"
        )
        output.append(f" as db user {CFG.db_user}, into the directory {parent_dir}")
        # Get all child section of self
        child_sections = (
            Rubrique.select()
            .where(Rubrique.id_parent == self.id_rubrique)
            .order_by(Rubrique.date.desc())
        )
        nb: int = len(child_sections)
        # Do the same for subsections (write their entire subtree)
        for i, s in enumerate(child_sections):
            output.append(s.write_tree(parent_dir, i, nb))
        return output