fix conversion of links + replace unknown chars with a configurable string

2023-05-30 17:48:54 +02:00 · 2023-05-30 17:48:54 +02:00 · fbad1f9563
commit fbad1f9563
parent dd370eb175
4 changed files with 126 additions and 91 deletions
--- a/spip2md/init.py
+++ b/spip2md/init.py
@@ -78,4 +78,5 @@ stored into {esc(BOLD)}{branches}{esc()} directories"""
    )

    # Warn about issued warnings in log file
-    print(f"\nThere might be warnings and infos in {esc(BOLD)}{CFG.logfile}{esc()}")
+    if isfile(CFG.logfile):
+        print(f"\nTake a look at warnings and infos in {esc(BOLD)}{CFG.logfile}{esc()}")
--- a/spip2md/config.py
+++ b/spip2md/config.py
@@ -27,6 +27,8 @@ class Configuration:
    export_filetype: str = "md"
    logfile: str = "spip2md.log"
    loglevel: str = "WARNING"
+    unknown_char_replacement: str = "??"
+    alternative_languages = ("fr", "en", "es")
    # max_articles_export: int = 1000  # TODO reimplement with recursion
    # max_sections_export: int = 500  # TODO reimplement with recursion

--- a/spip2md/regexmap.py
+++ b/spip2md/regexmap.py
@@ -108,32 +108,44 @@ SPIP_MARKDOWN = (
 )

 DOCUMENT_LINK = (
-    (  # SPIP style documents & embeds links
+    (  # SPIP style embeds
        compile(r"<()(?:doc|document|emb|embed)([0-9]+)(?:\|(.*?))?>", S | I),
        r"[{}]({})",
    ),
+    (  # SPIP style documents & embeds links
+        compile(r"\[ *([^\]]*?) *-> *(?:doc|document|emb|embed)([0-9]+) *\]", S | I),
+        r"[{}]({})",
+    ),
    (  # Markdown style documents & embeds links
        compile(r"\[(.*?)\]\((?:doc|document|emb|embed)([0-9]+)(?:\|(.*?))?\)", S | I),
-        r"[\1{}]({})",
+        r"[{}]({})",
    ),
-    (  # SPIP style images links
+    (  # SPIP style images embeds
        compile(r"<()(?:img|image)([0-9]+)(?:\|(.*?))?>", S | I),
        r"![{}]({})",
    ),
+    (  # SPIP style image links
+        compile(r"\[ *([^\]]*?) *-> *(?:img|image)([0-9]+) *\]", S | I),
+        r"[{}]({})",
+    ),
    (  # Markdown style images links
        compile(r"\[(.*?)\]\((?:img|image)([0-9]+)(?:\|(.*?))?\)", S | I),
-        r"![\1{}]({})",
+        r"![{}]({})",
    ),
 )  # Name and path can be further replaced with .format()

 ARTICLE_LINK = (
-    (  # SPIP style documents & embeds links
+    (  # SPIP style article embeds
        compile(r"<()(?:art|article)([0-9]+)(?:\|(.*?))?>", S | I),
        r"[{}]({})",
    ),
+    (  # SPIP style article links
+        compile(r"\[ *([^\]]*?) *-> *(?:art|article)([0-9]+) *\]", S | I),
+        r"[{}]({})",
+    ),
    (  # Markdown style internal links
        compile(r"\[(.*?)\]\((?:art|article)([0-9]+)(?:\|(.*?))?\)", S | I),
-        r"[\1{}]({})",
+        r"[{}]({})",
    ),
 )  # Name and path can be further replaced with .format()

@@ -144,7 +156,7 @@ SECTION_LINK = (
    ),
    (  # Markdown style internal links
        compile(r"\[(.*?)\]\((?:rub|rubrique)([0-9]+)(?:\|(.*?))?\)", S | I),
-        r"[\1{}]({})",
+        r"[{}]({})",
    ),
 )  # Name and path can be further replaced with .format()

@@ -303,5 +315,6 @@ SPECIAL_OUTPUT = (
 WARNING_OUTPUT = (
    compile(r"(ERROR)"),  # ERROR
    compile(r"(MISSING NAME)"),  # MISSING NAME
+    compile(r"(EMPTY NAME)"),  # EMPTY NAME
    compile(r"(NOT FOUND)"),  # NOT FOUND
 )
--- a/spip2md/spipobjects.py
+++ b/spip2md/spipobjects.py
@@ -68,7 +68,11 @@ class SpipWritable:
        return MULTILANG_BLOCK.sub(replace_lang, text)

    # Apply different mappings to a text field, like SPIP to Markdown or encoding
-    def convert(self, text: Optional[str], clean_html: bool = True) -> str:
+    def convert(self, text: str, clean_html: bool = True) -> str:
+        if len(text) == 0:
+            # print("Empty text")
+            return ""
+
        # Return unknown char surrounded by context_length chars
        def unknown_chars_context(text: str, char: str, context_len: int = 24) -> str:
            context: str = r".{0," + str(context_len) + r"}"
@@ -81,42 +85,44 @@ class SpipWritable:
            else:
                return char

-        if text is not None and len(text) > 0:
-            print(f"Converting {text[:40]} from {self.titre}")
-            # Convert SPIP syntax to Markdown
-            for spip, markdown in SPIP_MARKDOWN:
-                text = spip.sub(markdown, text)
-            # Remove useless text
-            for bloat in BLOAT:
-                text = bloat.sub("", text)
-            # Convert broken ISO encoding to UTF
-            for iso, utf in ISO_UTF:
-                text = text.replace(iso, utf)
-            # Handle <multi> multi language blocks
-            text = self.translate(text)
-            # Delete remaining HTML tags in body WARNING
-            if clean_html:
-                text = HTMLTAG.sub("", text)
-            # Warn about unknown chars
-            for char in UNKNOWN_ISO:
-                lastend: int = 0
-                for match in finditer("(" + char + ")+", text):
-                    context: str = unknown_chars_context(text[lastend:], char)
+        # Convert SPIP syntax to Markdown
+        for spip, markdown in SPIP_MARKDOWN:
+            text = spip.sub(markdown, text)
+        # Remove useless text
+        for bloat in BLOAT:
+            text = bloat.sub("", text)
+        # Convert broken ISO encoding to UTF
+        for iso, utf in ISO_UTF:
+            text = text.replace(iso, utf)
+        # Handle <multi> multi language blocks
+        text = self.translate(text)
+        # Delete remaining HTML tags in body WARNING
+        if clean_html:
+            text = HTMLTAG.sub("", text)
+        # Warn about unknown chars
+        for char in UNKNOWN_ISO:
+            lastend: int = 0
+            for match in finditer("(" + char + ")+", text):
+                context: str = unknown_chars_context(text[lastend:], char)
+                logging.warn(
+                    f"Unknown char {char} found in {self.titre[:40]} at: {context}"
+                )
+                if CFG.unknown_char_replacement is not None:
                    logging.warn(
-                        f"Unknown char {char} found in {self.titre[:40]} at: {context}"
+                        f"Replacing {match.group()} with {CFG.unknown_char_replacement}"
                    )
-                    lastend = match.end()
-        else:
-            print("Empty or null text")
-            return ""
+                    text = text.replace(match.group(), CFG.unknown_char_replacement, 1)
+                lastend = match.end()
        return text

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
-        print(f"Convert titre from {self.titre}")
-        self.titre: str = self.convert(self.titre)
-        print(f"Convert descriptif from {self.titre}")
-        self.descriptif: str = self.convert(self.descriptif)
+        if self.titre is not None:
+            # print(f"Convert titre from {type(self)} {self.titre}")
+            self.titre: str = self.convert(self.titre)
+        if self.descriptif is not None:
+            # print(f"Convert descriptif from {type(self)} {self.titre}")
+            self.descriptif: str = self.convert(self.descriptif)

    def filename(self, date: bool = False) -> str:
        raise NotImplementedError(
@@ -148,10 +154,12 @@ class SpipWritable:
            self.style_print(output[-1])
        # Output the counter & title of the object being exported
        output.append(f"{index + 1}. ")
-        if len(self.titre) > 0:
-            output[-1] += self.titre.strip(" ")
-        else:
+        if self.titre is None:
            output[-1] += "MISSING NAME"
+        elif len(self.titre) == 0:
+            output[-1] += "EMPTY NAME"
+        else:
+            output[-1] += self.titre.strip(" ")
        # Print the output as the program goes
        self.style_print(output[-1], end="")
        return output
@@ -213,59 +221,66 @@ class SpipObject(SpipWritable):
    descriptif: str
    extra: str

-    def convert(self, text: Optional[str], clean_html: bool = True) -> str:
+    def convert(self, text: str, clean_html: bool = True) -> str:
+        if len(text) == 0:
+            # print("Empty text")
+            return ""
+
        def found_replace(path_link: str, doc: Any, text: str, match: Match) -> str:
-            repl: str = path_link.format(doc.titre, doc.filename())
-            print(f"Translating link to {repl}")
+            # TODO get relative path
+            if len(match.group(1)) > 0:
+                repl: str = path_link.format(match.group(1), doc.filename())
+            else:
+                repl: str = path_link.format(doc.titre, doc.filename())
+            logging.warn(f"Translating link to {repl}")
            return text.replace(match.group(), repl)

        def not_found_warn(path_link: str, text: str, match: Match) -> str:
            logging.warn(f"No object for link {match.group()} in {self.titre}")
-            return text.replace(match.group(), path_link.format("", "NOT FOUND"))
+            return text.replace(match.group(), path_link.format("", "NOT FOUND"), 1)

-        if text is not None and len(text) > 0:
-            for id_link, path_link in DOCUMENT_LINK:
-                print(f"Looking for links like {id_link}")
-                for match in id_link.finditer(text):
-                    logging.info(f"Found document link {match.group()} in {self.titre}")
-                    try:
-                        doc: Document = Document.get(
-                            Document.id_document == match.group(2)
-                        )
-                        text = found_replace(path_link, doc, text, match)
-                    except DoesNotExist:
-                        text = not_found_warn(path_link, text, match)
-            for id_link, path_link in ARTICLE_LINK:
-                print(f"Looking for links like {id_link}")
-                for match in id_link.finditer(text):
-                    logging.info(f"Found article link {match.group()} in {self.titre}")
-                    try:
-                        art: Article = Article.get(Article.id_article == match.group(2))
-                        text = found_replace(path_link, art, text, match)
-                    except DoesNotExist:
-                        text = not_found_warn(path_link, text, match)
-            for id_link, path_link in SECTION_LINK:
-                print(f"Looking for links like {id_link}")
-                for match in id_link.finditer(text):
-                    logging.info(f"Found section link {match.group()} in {self.titre}")
-                    try:
-                        section: Rubrique = Rubrique.get(
-                            Rubrique.id_rubrique == match.group(2)
-                        )
-                        text = found_replace(path_link, section, text, match)
-                    except DoesNotExist:
-                        text = not_found_warn(path_link, text, match)
-        else:
-            return ""
+        for id_link, path_link in DOCUMENT_LINK:
+            # print(f"Looking for links like {id_link}")
+            for match in id_link.finditer(text):
+                logging.warning(f"Found document link {match.group()} in {self.titre}")
+                try:
+                    doc: Document = Document.get(Document.id_document == match.group(2))
+                    text = found_replace(path_link, doc, text, match)
+                except DoesNotExist:
+                    text = not_found_warn(path_link, text, match)
+        for id_link, path_link in ARTICLE_LINK:
+            # print(f"Looking for links like {id_link}")
+            for match in id_link.finditer(text):
+                logging.info(f"Found article link {match.group()} in {self.titre}")
+                try:
+                    art: Article = Article.get(Article.id_article == match.group(2))
+                    text = found_replace(path_link, art, text, match)
+                except DoesNotExist:
+                    text = not_found_warn(path_link, text, match)
+        for id_link, path_link in SECTION_LINK:
+            # print(f"Looking for links like {id_link}")
+            for match in id_link.finditer(text):
+                logging.info(f"Found section link {match.group()} in {self.titre}")
+                try:
+                    section: Rubrique = Rubrique.get(
+                        Rubrique.id_rubrique == match.group(2)
+                    )
+                    text = found_replace(path_link, section, text, match)
+                except DoesNotExist:
+                    text = not_found_warn(path_link, text, match)
        return super().convert(text, clean_html)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Common fields that need conversions
-        print(f"Convert texte from {self.titre}")
-        self.texte: str = self.convert(self.texte)
-        print(f"Convert extra from {self.titre}")
-        self.extra: str = self.convert(self.extra)
+        if self.texte is not None:
+            # print(f"Convert texte from {type(self)} {self.titre}")
+            # print(f"First 500 chars: {self.texte[:500]}")
+            self.texte: str = self.convert(self.texte)
+        if self.extra is not None:
+            # print(f"Convert extra from {type(self)} {self.titre}")
+            # print(f"First 500 chars: {self.extra[:500]}")
+            self.extra: str = self.convert(self.extra)
        self.statut: str = "false" if self.statut == "publie" else "true"
        self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
        # Define file prefix (needs to be redefined for sections)
@@ -326,14 +341,14 @@ class SpipObject(SpipWritable):
        # Start the content with frontmatter
        body: str = "---\n" + self.frontmatter() + "---"
        # Add the title as a Markdown h1
-        if len(self.titre) > 0 and CFG.prepend_h1:
+        if self.titre is not None and len(self.titre) > 0 and CFG.prepend_h1:
            body += "\n\n# " + self.titre
        # If there is a text, add the text preceded by two line breaks
-        if len(self.texte) > 0:
+        if self.texte is not None and len(self.texte) > 0:
            # Remove remaining HTML after & append to body
            body += "\n\n" + self.texte
        # Same with an "extra" section
-        if len(self.extra) > 0:
+        if self.extra is not None and len(self.extra) > 0:
            body += "\n\n# EXTRA\n\n" + self.extra
        return body

@@ -361,10 +376,14 @@ class Article(SpipObject, SpipArticles):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # More conversions needed for articles
-        self.surtitre: str = self.convert(self.surtitre)
-        self.soustitre: str = self.convert(self.soustitre)
-        self.chapo: str = self.convert(self.chapo)
-        self.ps: str = self.convert(self.ps)
+        if self.surtitre is not None:
+            self.surtitre: str = self.convert(self.surtitre)
+        if self.soustitre is not None:
+            self.soustitre: str = self.convert(self.soustitre)
+        if self.chapo is not None:
+            self.chapo: str = self.convert(self.chapo)
+        if self.ps is not None:
+            self.ps: str = self.convert(self.ps)
        self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"
        # ID
        self.object_id = self.id_article