fix document linking

2023-05-30 12:46:17 +02:00 · 2023-05-30 12:46:17 +02:00 · 27c281db90
commit 27c281db90
parent 9c79433f74
2 changed files with 95 additions and 67 deletions
--- a/spip2md/regexmap.py
+++ b/spip2md/regexmap.py
@ -5,17 +5,17 @@ from re import I, S, compile
 # ((SPIP syntax, Replacement Markdown syntax), …)
 SPIP_MARKDOWN = (
    (  # horizontal rule
-        compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I),
+        compile(r"\r?\n?\r?\n?- ?- ?- ?- ?[\- ]*\r?\n?\r?\n?|<hr ?.*?>", I),
        # r"---",
-        r"***",
+        "\n\n***\n\n",
    ),
    (  # line break
-        compile(r"\r?\n_ *(?=\r?\n)|<br ?.*?>", S | I),
+        compile(r"\r?\n_ *(?=\r?\n)|<br ?.*?>", I),
-        "\n",
+        "\n",  # WARNING not the real translation
    ),
    (  # heading
-        compile(r"\{\{\{ *(.*?) *\}\}\}", S | I),
+        compile(r"\r?\n?\r?\n?\{\{\{ *(.*?) *\}\}\}\r?\n?\r?\n?", S | I),
-        r"## \1",  # Translate SPIP headings to h2
+        "\n\n## \\1\n\n",  # Translate SPIP headings to h2
    ),
    (  # strong
        compile(r"\{\{ *(.*?) *\}\} ?", S | I),
@ -40,18 +40,18 @@ SPIP_MARKDOWN = (
        ),
        r"~\1~",
    ),
-    (  # images
+    # (  # images # processed by a specific function
-        compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I),
+    #     compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I),
-        r"![](\2)",  # Needs to be further processed to replace ID with filename
+    #     r"![](\2)",
-    ),
+    # ),
-    (  # documents & embeds
+    # (  # documents & embeds # processed by a specific function
-        compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
+    #     compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
-        r"[](\2)",  # Needs to be further processed to replace ID with filename
+    #     r"[](\2)",
-    ),
+    # ),
-    (  # internal links
+    # (  # internal links # processed by a specific function
-        compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I),
+    #     compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I),
-        r"[](\2)",  # Needs to be further processed to replace ID with filename
+    #     r"[](\2)",
-    ),
+    # ),
    (  # anchor
        compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
        r"[\1](\2)",
@ -62,7 +62,7 @@ SPIP_MARKDOWN = (
    ),
    (  # footnote
        compile(r"\[\[ *(.*?) *\]\]", S | I),
-        r"",
+        r"",  # WARNING remove it
    ),
    (  # unordered list
        compile(r"(\r?\n)-(?!#|-)\*? *", S | I),
@ -107,10 +107,35 @@ SPIP_MARKDOWN = (
    ),
 )
-# Match against documents ID found in links, ID can be inserted with .format()
+DOCUMENT_LINK = (
-# Name and path can be further replaced with .format()
+    (  # SPIP style documents & embeds links
-DOCUMENT_LINK = r"(!)?\[(.*?)\]\(({})\)"
+        compile(r"<()(?:doc|document|emb|embed)([0-9]+)(?:\|(.*?))?>", S | I),
-DOCUMENT_LINK_REPL = r"\1[\2{}]({})"
+        r"[{}]({})",
    ),
    (  # Markdown style documents & embeds links
        compile(r"\[(.*?)\]\((?:doc|document|emb|embed)([0-9]+)(?:\|(.*?))?\)", S | I),
        r"[\1{}]({})",
    ),
    (  # SPIP style images links
        compile(r"<()(?:img|image)([0-9]+)(?:\|(.*?))?>", S | I),
        r"![{}]({})",
    ),
    (  # Markdown style images links
        compile(r"\[(.*?)\]\((?:img|image)([0-9]+)(?:\|(.*?))?\)", S | I),
        r"![\1{}]({})",
    ),
 )  # Name and path can be further replaced with .format()
 ARTICLE_LINK = (
    (  # SPIP style documents & embeds links
        compile(r"<()(?:art|article)([0-9]+)(?:\|(.*?))?>", S | I),
        r"[{}]({})",
    ),
    (  # Markdown style internal links
        compile(r"\[(.*?)\]\((?:art|article)([0-9]+)(?:\|(.*?))?\)", S | I),
        r"[\1{}]({})",
    ),
 )  # Name and path can be further replaced with .format()
 # Multi language block, to be further processed per lang
 MULTILANG_BLOCK = compile(r"<multi>(.+?)<\/multi>", S | I)
@ -120,12 +145,12 @@ MULTILANGS = compile(
 # WARNING probably useless text in metadata fields, to be removed
 BLOAT = (
-    compile(r"^>+ +", S | I),  # Remove beginning with angle bracket(s)
+    compile(r"^>+ +"),  # Remove beginning with angle bracket(s)
-    compile(r"^\d+\. +", S | I),  # Remove beginning with a number followed by a dot
+    compile(r"^\d+\. +"),  # Remove beginning with a number followed by a dot
 )
 # Matches against every HTML tag
-HTMLTAG = compile(r"<\/?.*?>\s*", S | I)
+HTMLTAG = compile(r"<\/?.*?>\s*", S)
 # ((Broken ISO 8859-1 encoding, Proper UTF equivalent encoding), …)
--- a/spip2md/spipobjects.py
+++ b/spip2md/spipobjects.py
@ -2,7 +2,7 @@
 import logging
 from os import makedirs, remove
 from os.path import basename, splitext
-from re import finditer, search, sub
+from re import finditer, search
 from shutil import copyfile
 from typing import Any, Match, Optional
@ -20,9 +20,9 @@ from spip2md.database import (
    SpipRubriques,
 )
 from spip2md.regexmap import (
    ARTICLE_LINK,
    BLOAT,
    DOCUMENT_LINK,
    DOCUMENT_LINK_REPL,
    HTMLTAG,
    ISO_UTF,
    MULTILANG_BLOCK,
@ -72,14 +72,6 @@ class SpipWritable:
        return MULTILANG_BLOCK.sub(replace_lang, text)
    # Remove remaining HTML tags
    @staticmethod
    def clean_html(string: str) -> str:
        if string is not None and len(string) > 0:
            return HTMLTAG.sub("", string)
        else:
            return ""
    # Apply different mappings to a text field, like SPIP to Markdown or encoding
    def convert(self, text: Optional[str], clean_html: bool = True) -> str:
        # Return unknown char surrounded by context_length chars
@ -108,7 +100,7 @@ class SpipWritable:
            text = self.translate(text)
            # Delete remaining HTML tags in body WARNING
            if clean_html:
-                text = self.clean_html(text)
+                text = HTMLTAG.sub("", text)
            # Warn about unknown chars
            for char in UNKNOWN_ISO:
                lastend: int = 0
@ -222,26 +214,55 @@ class SpipObject(SpipWritable):
    descriptif: str
    extra: str
    def convert(self, text: Optional[str], clean_html: bool = True) -> str:
        if text is not None and len(text) > 0:
            for id_link, path_link in DOCUMENT_LINK:
                for match in id_link.finditer(text):
                    doc: Document = Document.get(Document.id_document == match.group(2))
                    if doc is not None:
                        text = text.replace(
                            match.group(), path_link.format(doc.titre, doc.filename())
                        )
                    else:
                        logging.warn(
                            f"No document for link {match.group()} in {self.titre}"
                        )
                        text = text.replace(
                            match.group(), path_link.format("", "NOT FOUND")
                        )
            for id_link, path_link in ARTICLE_LINK:
                for match in id_link.finditer(text):
                    art: Article = Article.get(Article.id_article == match.group(2))
                    if art is not None:
                        text = text.replace(
                            match.group(),
                            path_link.format(
                                art.titre, f"{art.dir_slug()}/{art.filename()}"
                            ),
                        )
                    else:
                        logging.warn(
                            f"No article for link {match.group()} in {self.titre}"
                        )
                        text = text.replace(
                            match.group(), path_link.format("", "NOT FOUND")
                        )
        else:
            return ""
        return super().convert(text, clean_html)
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Common fields that need conversions
        self.texte = self.convert(self.texte)
        self.extra: str = self.convert(self.extra)
        self.statut: str = "false" if self.statut == "publie" else "true"
        self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
        # Define file prefix (needs to be redefined for sections)
        self.prefix = "index"
-    # Convert SPIP style internal links for images & other files into Markdown style
+    # Get related documents
-    def link_documents(self, documents: ModelSelect) -> None:
+    def documents(self) -> ModelSelect:
        for d in documents:
            self.texte = sub(
                DOCUMENT_LINK.format(d.id_document),
                DOCUMENT_LINK_REPL.format(d.titre, d.filename()),
                self.texte,
            )
    # Output related documents & link them in the text by the way
    def documents(self, link_documents: bool = True) -> ModelSelect:
        documents = (
            Document.select()
            .join(
@ -250,23 +271,9 @@ class SpipObject(SpipWritable):
            )
            .where(SpipDocumentsLiens.id_objet == self.object_id)
        )
        if link_documents:
            self.link_documents(documents)
        return documents
-    # Convert SPIP style internal links for other articles or sections into Markdown
+    # Get related articles
    def link_articles(self) -> None:
        for match in finditer(r"\[(.*?)]\((?:art|article)([0-9]+)\)", self.texte):
            article = Article.get(Article.id_article == match.group(2))
            if len(match.group(1)) > 0:
                title: str = match.group(1)
            else:
                title: str = article.titre
            self.texte = self.texte.replace(
                match.group(0), f"[{title}]({article.dir_slug()}/{article.filename()})"
            )
    # Output related articles
    def articles(self) -> ModelSelect:
        return (
            Article.select()
@ -322,10 +329,6 @@ class SpipObject(SpipWritable):
    # Write object to output destination
    def write(self, parent_dir: str) -> str:
        # Link articles
        self.link_articles()
        # Convert body after linking articles
        self.texte = self.convert(self.texte)
        # Define actual export directory
        directory: str = parent_dir + self.dir_slug()
        # Make a directory for this object if there isn’t