image & documents links conversion ok

2023-05-23 13:40:32 +02:00 · 2023-05-23 13:40:32 +02:00 · dbfe417870
commit dbfe417870
parent 94b593da93
3 changed files with 49 additions and 22 deletions
--- a/spip2md/converter.py
+++ b/spip2md/converter.py
@ -1,5 +1,5 @@
 # pyright: strict
-from re import I, S, compile, finditer
+from re import I, S, compile, finditer, sub
 from typing import Optional

 # SPIP syntax to Markdown
@ -18,16 +18,16 @@ spip_to_markdown = (
        r"## \1",  # Translate SPIP headings to h2
    ),
    (  # strong
-        compile(r"\{\{ *(.*?) *\}\}", S | I),
-        r"**\1**",
+        compile(r"\{\{ *(.*?) *\}\} ?", S | I),
+        r"**\1** ",
    ),
    (  # html strong
        compile(r"<strong> *(.*?) *</strong>", S | I),
        r"**\1**",
    ),
    (  # emphasis
-        compile(r"\{ *(.*?) *\}", S | I),
-        r"*\1*",
+        compile(r"\{ *(.*?) *\} ?", S | I),
+        r"*\1* ",
    ),
    (  # html emphasis
        compile(r"<i> *(.*?) *<\/i>", S | I),
@ -44,10 +44,6 @@ spip_to_markdown = (
        compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
        r"[\1](\2)",
    ),
-    (  # document anchor
-        compile(r"<(?:doc|emb)(.*?)(\|.*?)*>", S | I),
-        r"[document](\1)",
-    ),
    (  # wikilink
        compile(r"\[\? *(.*?) *\]", S | I),
        r"[\1](https://wikipedia.org/wiki/\1)",
@ -74,7 +70,7 @@ spip_to_markdown = (
    ),
    (  # table-metadata
        compile(r"(\r?\n)\|\|(.*?)\|(.*?)\|\|", S | I),
-        r"",
+        r"",  # Remove it
    ),
    (  # quote
        compile(
@ -97,22 +93,15 @@ spip_to_markdown = (
        ),
        "```\n\\1\n\n```",
    ),
-    (  # Keep only the first language in multi-language blocks
+    (  # WARNING Keep only the first language in multi-language blocks
        compile(
            r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
            S | I,
        ),
        r"\1",
    ),
-    (  # WARNING remove every html tag
-        compile(r"<\/?.*?> *", S | I),
-        r"",
-    ),
 )

-## Match SPIP images
-spip_image = compile(r"<(?:img|image)(.*?)(\|.*?)*>", S | I)
-
 spip_to_text = (
    (  # strong
        compile(r"\{\{ *(.*?) *\}\}", S | I),
@ -148,16 +137,19 @@ spip_to_text = (
        compile(r"<\/?.*?> *", S | I),
        r"",
    ),
-    (  # beginning with angle bracket(s)
+    (  # Remove beginning with angle bracket(s)
        compile(r"^>+ +", S | I),
        r"",
    ),
-    (  # beginning with a number followed by a dot
+    (  # Remove beginning with a number followed by a dot
        compile(r"^\d+\. +", S | I),
        r"",
    ),
 )

+# Matches any HTML tag (with trailing spaces); WARNING: can be used to remove them all
+html_tag = compile(r"<\/?.*?> *", S | I)
+
 # Broken ISO encoding to proper UTF-8
 iso_to_utf = (
    (  # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1
@ -300,6 +292,27 @@ def convert_meta(text: Optional[str]) -> str:
    return text


+# Replace image & document references in SPIP text with Markdown links that have human-readable names
+def convert_documents(text: str, documents: list[tuple[int, str, str]]) -> str:
+    for id, name, slug in documents:
+        text = sub(
+            r"<(?:img|image)" + str(id) + r"(\|.*?)*>",
+            f"![{name}]({slug})",
+            text,
+        )
+        text = sub(
+            r"<(?:doc|emb)" + str(id) + r"(\|.*?)*>",
+            f"[{name}]({slug})",
+            text,
+        )
+        text = sub(
+            r"\[(.*?)\]\((?:doc|emb)" + str(id) + r"(\|.*?)*\)",
+            f"[\\1]({slug})",
+            text,
+        )
+    return text
+
+
 # Replace unknown chars with empty strings (delete them)
 def remove_unknown_chars(text: str) -> str:
    for char in unknown_iso:
@ -307,6 +320,11 @@ def remove_unknown_chars(text: str) -> str:
    return text


+# Replace HTML tags with empty strings (delete them)
+def remove_tags(text: str) -> str:
+    return html_tag.sub("", text)
+
+
 # Return a list of tuples giving the start and end of unknown substring in text
 def unknown_chars(text: str) -> list[tuple[int, int]]:
    positions: list[tuple[int, int]] = []
--- a/spip2md/items.py
+++ b/spip2md/items.py
@ -5,7 +5,7 @@ from typing import Any, Optional
 from slugify import slugify
 from yaml import dump

-from converter import convert_body, convert_meta
+from converter import convert_body, convert_documents, convert_meta, remove_tags
 from database import (
    SpipArticles,
    SpipAuteurs,
@ -98,6 +98,12 @@ class Article(Item):
        # self.referers: int = article.referers  # USELESS in static
        # self.popularity: float = article.popularite  # USELESS in static
        # self.version = article.id_version  # USELESS
+        # Convert images & files links
+        self.text = convert_documents(
+            self.text, [(d.id, d.title, d.get_slug()) for d, _ in self.get_documents()]
+        )
+        # Remove remaining HTML after
+        self.text = remove_tags(self.text)

    def get_authors(self) -> list[SpipAuteurs]:
        return (
@ -139,6 +145,9 @@ class Article(Item):
            body += "\n\n# MICROBLOGGING\n\n" + self.microblog
        return body

+    def get_documents(self):
+        return Documents(self.id)
+

 class Section(Item):
    def __init__(self, section: SpipRubriques) -> None:
--- a/spip2md/main.py
+++ b/spip2md/main.py
@ -119,7 +119,7 @@ if __name__ == "__main__":  # Only if script is directly executed
            if len(get_unknown_chars(article.text)) > 0:
                unknown_chars_articles.append(article)
            # Loop over article’s related files (images …)
-            for document, counter in Documents(article.id):
+            for document, counter in article.get_documents():
                if counter.count % 100 == 0:
                    s: str = "s" if counter.remaining() > 1 else ""
                    print("    Exporting", end="")