separation between classes, functions & regex mappings

2023-05-26 14:43:39 +02:00 · 2023-05-26 14:43:39 +02:00 · 32738a9269
commit 32738a9269
parent 5e7740a414
3 changed files with 91 additions and 99 deletions
--- a/spip2md/init.py
+++ b/spip2md/init.py
@ -9,11 +9,7 @@ from peewee import ModelSelect
 from spip2md.config import CFG
 from spip2md.database import DB
-from spip2md.regexmap import unknown_chars, unknown_chars_context
+from spip2md.spipobjects import Rubrique
 from spip2md.spipobjects import (
    Article,
    Rubrique,
 )
 # Define styles
 BOLD = 1  # Bold
@ -60,13 +56,7 @@ def root_sections(limit: int = 10**3) -> ModelSelect:
        .limit(limit)
    )
-
+r"""
 def has_unknown_chars(article: Article) -> bool:
    if len(unknown_chars_context(article.texte)) > 0:
        return True
    return False
 # Print the detected unknown chars in article in their context but highlighted
 def warn_unknown_chars(article: Article) -> None:
    # Print the title of the article in which there is unknown characters
@ -85,6 +75,7 @@ def warn_unknown_chars(article: Article) -> None:
        highlight(text, *unknown_chars(text))
        style(" … \n")
    print()  # Break line
 """
 # Print one root section list output correctly
@ -106,16 +97,16 @@ DB.connect()
 def main(*argv):
    if len(argv) == 0:
        argv = sys.argv
-    # Define max nb of articles to export based on first CLI argument
+    # Define max nb of sections to export based on first CLI argument TODO
    if len(argv) >= 2:
-        articles_export = int(argv[1])
+        sections_export = int(argv[1])
    else:
        articles_export = CFG.max_articles_export
    # Define max nb of sections to export based on second CLI argument
    if len(argv) >= 3:
        sections_export = int(argv[2])
    else:
        sections_export = CFG.max_sections_export
    # Define max nb of articles to export based on second CLI argument TODO
    # if len(argv) >= 3:
    #     articles_export = int(argv[2])
    # else:
    #     articles_export = CFG.max_articles_export
    # Clear the output dir & create a new
    if CFG.clear_output:
--- a/spip2md/regexmap.py
+++ b/spip2md/regexmap.py
@ -1,10 +1,9 @@
 # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
 # pyright: strict
-from re import I, S, compile, finditer, sub
+from re import I, S, compile
 from typing import Optional
-# SPIP syntax to Markdown
+# ((SPIP syntax, Replacement Markdown syntax), …)
-SPIP_TO_MARKDOWN = (
+SPIP_MARKDOWN = (
    (  # horizontal rule
        compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I),
        # r"---",
@ -43,15 +42,15 @@ SPIP_TO_MARKDOWN = (
    ),
    (  # images
        compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I),
-        r"![](\1\2)",
+        r"![](\2)",  # Needs to be further processed to replace ID with filename
    ),
    (  # documents & embeds
        compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
-        r"[](\1\2)",
+        r"[](\2)",  # Needs to be further processed to replace ID with filename
    ),
    (  # internal links
        compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I),
-        r"[](\1\2)",
+        r"[](\2)",  # Needs to be further processed to replace ID with filename
    ),
    (  # anchor
        compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
@ -106,20 +105,31 @@ SPIP_TO_MARKDOWN = (
        ),
        "```\n\\1\n\n```",
    ),
    (  # WARNING remove every html tag
        compile(r"<\/?.*?>\s*", S | I),
        r"",
    ),
 )
-# Further cleaning for metadata texts such as titles or descriptions
+# Match against documents ID found in links, ID can be inserted with .format()
-SPIP_META_BLOAT = (
+# Name and path can be further replaced with .format()
 DOCUMENT_LINK = r"(!)?\[(.*?)\]\(({})\)"
 DOCUMENT_LINK_REPL = r"\1[\2{}]({})"
 # Multi language block, capture groups: (lang, text, lang, text, …)
 MULTILANG = compile(
    r"<multi>(?:\s*\[(.{2,6})\]\s*(.*?)\s*)+<\/multi>",
    S | I,
 )
 # WARNING probably useless text in metadata fields, to be removed
 BLOAT = (
    compile(r"^>+ +", S | I),  # Remove beginning with angle bracket(s)
    compile(r"^\d+\. +", S | I),  # Remove beginning with a number followed by a dot
 )
-# Broken ISO encoding to proper UTF-8
+# Matches against every HTML tag
-ISO_TO_UTF = (
+HTMLTAG = compile(r"<\/?.*?>\s*", S | I)
 # ((Broken ISO 8859-1 encoding, Proper UTF equivalent encoding), …)
 ISO_UTF = (
    (  # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
        "â€™",
        r"’",
@ -224,7 +234,7 @@ ISO_TO_UTF = (
        "iÌ\u0081",
        r"í",
    ),
-    # WARNING not sure
+    # WARNING not sure below
    (  # Fix UTF-8 é that was interpreted as ISO 8859-1
        "eÌ ",
        r"é",
@ -239,62 +249,22 @@ ISO_TO_UTF = (
    ),
 )
-# WARNING unknown broken encoding
+# WARNING broken ISO 8859-1 encoding whose UTF equivalent I don’t know
 UNKNOWN_ISO = (
-    r"â€¨",
+    "â€¨",
-    r"âˆ†",
+    "âˆ†",
 )
 # Multi language block, capture the first
 MULTILINGUAL = compile(
    r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
    S | I,
 )
-# Apply SPIP to Markdown & ISO to UTF conversions to a text, & eventually clean meta
+# Special elements in terminal output to surround
-def convert(text: Optional[str], clean_meta: bool = False) -> str:
+SPECIAL_OUTPUT = (
-    if text is None:
+    (compile(r"^([0-9]+?\.)(?= )"), r"{}\1{}"),  # Counter
-        return ""
+    (compile(r"(?<= )->(?= )"), r"{}->{}"),  # Arrow
-    for spip, markdown in SPIP_TO_MARKDOWN:
+    (compile(r"(?<=^Exporting )([0-9]+?)(?= )"), r"{}\1{}"),  # Total
-        text = spip.sub(markdown, text)
+)
    if clean_meta:
        for bloat in SPIP_META_BLOAT:
            text = bloat.sub("", text)
    for iso, utf in ISO_TO_UTF:
        text = text.replace(iso, utf)
    return text
 # Replace images & files links in Markdown with real slugs of the actually linked files
 def link_document(text: str, id: int, name: str, slug: str) -> str:
    # Replace images that dont have a title written in text
    text = sub(
        r"!\[]\((?:img|image)" + str(id) + r"(\|.*?)*\)",
        f"![{name}]({slug})",
        text,
    )
    # Replace images that dont have a title written in text
    text = sub(
        r"\[]\((?:doc|document|emb)" + str(id) + r"(\|.*?)*\)",
        f"[{name}]({slug})",
        text,
    )
    # Replace images that already had a title in Markdown style link
    text = sub(
        r"!\[(.+?)\]\((?:img|image)" + str(id) + r"(\|.*?)*\)",
        f"![\\1]({slug})",
        text,
    )
    # Replace documents that already had a title in Markdown style link
    text = sub(
        r"\[(.+?)\]\((?:doc|document|emb)" + str(id) + r"(\|.*?)*\)",
        f"[\\1]({slug})",
        text,
    )
    return text
 r"""
 # Return a list of tuples giving the start and end of unknown substring in text
 def unknown_chars(text: str) -> list[tuple[int, int]]:
    positions: list[tuple[int, int]] = []
@ -303,7 +273,6 @@ def unknown_chars(text: str) -> list[tuple[int, int]]:
            positions.append((match.start(), match.end()))
    return positions
 # Return strings with unknown chards found in text, surrounded by context_length chars
 def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
    errors: list[str] = []
@ -316,3 +285,4 @@ def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
        for match in matches:
            errors.append(match.group())
    return errors
 """
--- a/spip2md/spipobjects.py
+++ b/spip2md/spipobjects.py
@ -1,7 +1,7 @@
 # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
 from os import makedirs
 from os.path import basename, splitext
-from re import finditer
+from re import finditer, sub
 from shutil import copyfile
 from typing import Any, Optional
@ -18,7 +18,16 @@ from spip2md.database import (
    SpipDocumentsLiens,
    SpipRubriques,
 )
-from spip2md.regexmap import convert, link_document, unknown_chars
+from spip2md.regexmap import (
    BLOAT,
    DOCUMENT_LINK,
    DOCUMENT_LINK_REPL,
    HTMLTAG,
    ISO_UTF,
    MULTILANG,
    SPIP_MARKDOWN,
    UNKNOWN_ISO,
 )
 class SpipWritable:
@ -26,6 +35,7 @@ class SpipWritable:
    texte: str
    lang: str
    titre: str
    profondeur: int
    def filename(self, date: bool = False) -> str:
        raise NotImplementedError(
@ -49,6 +59,19 @@ class SpipWritable:
            output[-1] += "MISSING NAME"
        return output
    # Apply the SPIP to Markdown, bloat removal & encoding mappings to text fields
    # attrs: names of extra attributes to convert, "titre" & "descriptif" are
    # always converted, each attribute is rewritten in place on the object
    def convert_attrs(self, *attrs: str) -> None:
        # dict.fromkeys drops duplicated names (subclasses may pass "descriptif"
        # again) while keeping order, so no field is converted twice
        for attr in dict.fromkeys((*attrs, "titre", "descriptif")):
            text = getattr(self, attr)
            # Nothing to convert in unset (None) or empty fields
            if not text:
                continue
            # Each mapping must work on the result of the previous one, so the
            # running value is kept in a local & stored only once at the end
            for spip, markdown in SPIP_MARKDOWN:
                text = spip.sub(markdown, text)
            for bloat in BLOAT:
                text = bloat.sub("", text)
            for iso, utf in ISO_UTF:
                text = text.replace(iso, utf)
            setattr(self, attr, text)
    # Write object to output destination
    def write(self, parent_dir: str) -> str:
        raise NotImplementedError(
@ -69,8 +92,6 @@ class Document(SpipWritable, SpipDocuments):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.titre: str = convert(self.titre, True)
        self.descriptif: str = convert(self.descriptif, True)
        self.statut: str = "false" if self.statut == "publie" else "true"
    # Get slugified name of this file
@ -86,6 +107,8 @@ class Document(SpipWritable, SpipDocuments):
    # Write document to output destination
    def write(self, parent_dir: str) -> str:
        # Apply needed conversions
        super().convert_attrs()
        # Define file source and destination
        src: str = CFG.data_dir + self.fichier
        dest: str = parent_dir + self.filename()
@ -100,23 +123,25 @@ class SpipObject(SpipWritable):
    date: DateTimeField
    maj: str
    id_secteur: int
    descriptif: str
    extra: str
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Common fields that need conversions
        self.titre: str = convert(self.titre, True)
        self.descriptif: str = convert(self.descriptif, True)
        self.texte: str = convert(self.texte)  # Convert SPIP to Markdown
        self.statut: str = "false" if self.statut == "publie" else "true"
        self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
        self.extra: str = convert(self.extra)  # Probably unused
        # Define file prefix (needs to be redefined for sections)
        self.prefix = "index"
    # Convert SPIP style internal links for images & other files into Markdown style
    def link_documents(self, documents: ModelSelect) -> None:
        for d in documents:
-            self.texte = link_document(self.texte, d.id_document, d.titre, d.filename())
+            self.texte = sub(
                DOCUMENT_LINK.format(d.id_document),
                DOCUMENT_LINK_REPL.format(d.titre, d.filename()),
                self.texte,
            )
    # Output related documents & link them in the text by the way
    def documents(self, link_documents: bool = True) -> ModelSelect:
@ -198,8 +223,13 @@ class SpipObject(SpipWritable):
            body += "\n\n# EXTRA\n\n" + self.extra
        return body
    # Convert the requested attributes plus the "descriptif" & "extra" fields,
    # delegating the actual conversion to the parent implementation
    def convert_attrs(self, *attrs: str) -> None:
        own_fields = ("descriptif", "extra")
        return super().convert_attrs(*attrs, *own_fields)
    # Write object to output destination
    def write(self, parent_dir: str) -> str:
        # Apply needed conversions
        super().convert_attrs()
        # Define actual export directory
        directory: str = parent_dir + self.dir_slug()
        # Make a directory for this object if there isn’t
@ -219,14 +249,15 @@ class Article(SpipObject, SpipArticles):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # More conversions needed for articles
        self.surtitre: str = convert(self.surtitre, True)  # Probably unused
        self.soustitre: str = convert(self.soustitre, True)  # Probably unused
        self.chapo: str = convert(self.chapo)  # Probably unused
        self.ps: str = convert(self.ps)  # Probably unused
        self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"
        # ID
        self.object_id = self.id_article
    # Convert the requested attributes plus the article specific fields,
    # delegating the actual conversion to the parent implementation
    def convert_attrs(self, *attrs: str) -> None:
        article_fields = ("surtitre", "soustitre", "chapo", "ps", "accepter_forum")
        return super().convert_attrs(*attrs, *article_fields)
    def frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
        meta: dict[str, Any] = {
            # Article specific