From 32738a9269d947076421c6f1be849c55ca6aee69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guilhem=20Faur=C3=A9?= <pro@gfaure.eu>
Date: Fri, 26 May 2023 14:43:39 +0200
Subject: [PATCH] separation between classes, functions & regex mappings

---
 spip2md/__init__.py    |  29 ++++--------
 spip2md/regexmap.py    | 104 +++++++++++++++--------------------------
 spip2md/spipobjects.py |  57 ++++++++++++++++------
 3 files changed, 91 insertions(+), 99 deletions(-)
diff --git a/spip2md/__init__.py b/spip2md/__init__.py
index 3c50d5e..f67b322 100644
--- a/spip2md/__init__.py
+++ b/spip2md/__init__.py
@@ -9,11 +9,7 @@ from peewee import ModelSelect
 
 from spip2md.config import CFG
 from spip2md.database import DB
-from spip2md.regexmap import unknown_chars, unknown_chars_context
-from spip2md.spipobjects import (
-    Article,
-    Rubrique,
-)
+from spip2md.spipobjects import Rubrique
 
 # Define styles
 BOLD = 1  # Bold
@@ -60,13 +56,7 @@ def root_sections(limit: int = 10**3) -> ModelSelect:
         .limit(limit)
     )
 
-
-def has_unknown_chars(article: Article) -> bool:
-    if len(unknown_chars_context(article.texte)) > 0:
-        return True
-    return False
-
-
+r"""
 # Print the detected unknown chars in article in their context but highlighted
 def warn_unknown_chars(article: Article) -> None:
     # Print the title of the article in which there is unknown characters
@@ -85,6 +75,7 @@ def warn_unknown_chars(article: Article) -> None:
         highlight(text, *unknown_chars(text))
         style(" … \n")
     print()  # Break line
+"""
 
 
 # Print one root section list output correctly
@@ -106,16 +97,16 @@ DB.connect()
 def main(*argv):
     if len(argv) == 0:
         argv = sys.argv
-    # Define max nb of articles to export based on first CLI argument
+    # Define max nb of sections to export based on first CLI argument TODO
     if len(argv) >= 2:
-        articles_export = int(argv[1])
-    else:
-        articles_export = CFG.max_articles_export
-    # Define max nb of sections to export based on second CLI argument
-    if len(argv) >= 3:
-        sections_export = int(argv[2])
+        sections_export = int(argv[1])
     else:
         sections_export = CFG.max_sections_export
+    # Define max nb of articles to export based on second CLI argument TODO
+    # if len(argv) >= 3:
+    #     articles_export = int(argv[2])
+    # else:
+    #     articles_export = CFG.max_articles_export
 
     # Clear the output dir & create a new
     if CFG.clear_output:
diff --git a/spip2md/regexmap.py b/spip2md/regexmap.py
index 5222286..314f0e1 100644
--- a/spip2md/regexmap.py
+++ b/spip2md/regexmap.py
@@ -1,10 +1,9 @@
 # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
 # pyright: strict
-from re import I, S, compile, finditer, sub
-from typing import Optional
+from re import I, S, compile
 
-# SPIP syntax to Markdown
-SPIP_TO_MARKDOWN = (
+# ((SPIP syntax, Replacement Markdown syntax), …)
+SPIP_MARKDOWN = (
     (  # horizontal rule
         compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I),
         # r"---",
@@ -43,15 +42,15 @@ SPIP_TO_MARKDOWN = (
     ),
     (  # images
         compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I),
-        r"![](\1\2)",
+        r"![](\2)",  # Needs to be further processed to replace ID with filename
     ),
     (  # documents & embeds
         compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
-        r"[](\1\2)",
+        r"[](\2)",  # Needs to be further processed to replace ID with filename
     ),
     (  # internal links
         compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I),
-        r"[](\1\2)",
+        r"[](\2)",  # Needs to be further processed to replace ID with filename
     ),
     (  # anchor
         compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
@@ -106,20 +105,31 @@ SPIP_TO_MARKDOWN = (
         ),
         "```\n\\1\n\n```",
     ),
-    (  # WARNING remove every html tag
-        compile(r"<\/?.*?>\s*", S | I),
-        r"",
-    ),
 )
 
-# Further cleaning for metadata texts such as titles or descriptions
-SPIP_META_BLOAT = (
+# Match Markdown links/images whose target is a document ID, inserted with .format()
+# Name & path go in with .format(); \g<N> keeps a leading digit out of the group ref
+DOCUMENT_LINK = r"(!)?\[(.*?)\]\(({})\)"
+DOCUMENT_LINK_REPL = r"\g<1>[\g<2>{}]({})"
+
+# Multi language block; a repeated group keeps only its last match: (lang, text)
+MULTILANG = compile(
+    r"<multi>(?:\s*\[(.{2,6})\]\s*(.*?)\s*)+<\/multi>",
+    S | I,
+)
+
+# WARNING probably useless text in metadata fields, to be removed
+BLOAT = (
     compile(r"^>+ +", S | I),  # Remove beginning with angle bracket(s)
     compile(r"^\d+\. +", S | I),  # Remove beginning with a number followed by a dot
 )
 
-# Broken ISO encoding to proper UTF-8
-ISO_TO_UTF = (
+# Matches against every HTML tag
+HTMLTAG = compile(r"<\/?.*?>\s*", S | I)
+
+
+# ((Broken ISO 8859-1 encoding, Proper UTF equivalent encoding), …)
+ISO_UTF = (
     (  # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1
         "â€™",
         r"’",
@@ -224,7 +234,7 @@ ISO_TO_UTF = (
         "iÌ\u0081",
         r"í",
     ),
-    # WARNING not sure
+    # WARNING not sure below
     (  # Fix UTF-8 é that was interpreted as ISO 8859-1
         "eÌ ",
         r"é",
@@ -239,62 +249,22 @@ ISO_TO_UTF = (
     ),
 )
 
-# WARNING unknown broken encoding
+# WARNING broken ISO 8859-1 encoding whose UTF equivalent I don’t know
 UNKNOWN_ISO = (
-    r"â€¨",
-    r"âˆ†",
-)
-
-# Multi language block, capture the first
-MULTILINGUAL = compile(
-    r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
-    S | I,
+    "â€¨",
+    "âˆ†",
 )
 
 
-# Apply SPIP to Markdown & ISO to UTF conversions to a text, & eventually clean meta
-def convert(text: Optional[str], clean_meta: bool = False) -> str:
-    if text is None:
-        return ""
-    for spip, markdown in SPIP_TO_MARKDOWN:
-        text = spip.sub(markdown, text)
-    if clean_meta:
-        for bloat in SPIP_META_BLOAT:
-            text = bloat.sub("", text)
-    for iso, utf in ISO_TO_UTF:
-        text = text.replace(iso, utf)
-    return text
-
-
-# Replace images & files links in Markdown with real slugs of the actually linked files
-def link_document(text: str, id: int, name: str, slug: str) -> str:
-    # Replace images that dont have a title written in text
-    text = sub(
-        r"!\[]\((?:img|image)" + str(id) + r"(\|.*?)*\)",
-        f"![{name}]({slug})",
-        text,
-    )
-    # Replace images that dont have a title written in text
-    text = sub(
-        r"\[]\((?:doc|document|emb)" + str(id) + r"(\|.*?)*\)",
-        f"[{name}]({slug})",
-        text,
-    )
-    # Replace images that already had a title in Markdown style link
-    text = sub(
-        r"!\[(.+?)\]\((?:img|image)" + str(id) + r"(\|.*?)*\)",
-        f"![\\1]({slug})",
-        text,
-    )
-    # Replace documents that already had a title in Markdown style link
-    text = sub(
-        r"\[(.+?)\]\((?:doc|document|emb)" + str(id) + r"(\|.*?)*\)",
-        f"[\\1]({slug})",
-        text,
-    )
-    return text
+# Special elements in terminal output to surround
+SPECIAL_OUTPUT = (
+    (compile(r"^([0-9]+?\.)(?= )"), r"{}\1{}"),  # Counter
+    (compile(r"(?<= )->(?= )"), r"{}->{}"),  # Arrow
+    (compile(r"(?<=^Exporting )([0-9]+?)(?= )"), r"{}\1{}"),  # Total
+)
 
 
+r"""
 # Return a list of tuples giving the start and end of unknown substring in text
 def unknown_chars(text: str) -> list[tuple[int, int]]:
     positions: list[tuple[int, int]] = []
@@ -303,7 +273,6 @@ def unknown_chars(text: str) -> list[tuple[int, int]]:
             positions.append((match.start(), match.end()))
     return positions
 
-
 # Return strings with unknown chards found in text, surrounded by context_length chars
 def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
     errors: list[str] = []
@@ -316,3 +285,4 @@ def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
         for match in matches:
             errors.append(match.group())
     return errors
+"""
diff --git a/spip2md/spipobjects.py b/spip2md/spipobjects.py
index a2abf14..ee4ffe5 100644
--- a/spip2md/spipobjects.py
+++ b/spip2md/spipobjects.py
@@ -1,7 +1,7 @@
 # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
 from os import makedirs
 from os.path import basename, splitext
-from re import finditer
+from re import finditer, sub
 from shutil import copyfile
 from typing import Any, Optional
 
@@ -18,7 +18,16 @@ from spip2md.database import (
     SpipDocumentsLiens,
     SpipRubriques,
 )
-from spip2md.regexmap import convert, link_document, unknown_chars
+from spip2md.regexmap import (
+    BLOAT,
+    DOCUMENT_LINK,
+    DOCUMENT_LINK_REPL,
+    HTMLTAG,
+    ISO_UTF,
+    MULTILANG,
+    SPIP_MARKDOWN,
+    UNKNOWN_ISO,
+)
 
 
 class SpipWritable:
@@ -26,6 +35,7 @@ class SpipWritable:
     texte: str
     lang: str
     titre: str
+    profondeur: int
 
     def filename(self, date: bool = False) -> str:
         raise NotImplementedError(
@@ -49,6 +59,19 @@ class SpipWritable:
             output[-1] += "MISSING NAME"
         return output
 
+    # Chain SPIP to Markdown, bloat removal & encoding fixes on each text attribute
+    def convert_attrs(self, *attrs: str) -> None:
+        for attr in dict.fromkeys(attrs + ("titre", "descriptif")):  # no duplicates
+            a = getattr(self, attr)
+            if a:  # Skip None & empty fields
+                for spip, markdown in SPIP_MARKDOWN:
+                    a = spip.sub(markdown, a)
+                for bloat in BLOAT:
+                    a = bloat.sub("", a)
+                for iso, utf in ISO_UTF:
+                    a = a.replace(iso, utf)
+                setattr(self, attr, a)  # Store once, after every mapping ran
+
     # Write object to output destination
     def write(self, parent_dir: str) -> str:
         raise NotImplementedError(
@@ -69,8 +92,6 @@ class Document(SpipWritable, SpipDocuments):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.titre: str = convert(self.titre, True)
-        self.descriptif: str = convert(self.descriptif, True)
         self.statut: str = "false" if self.statut == "publie" else "true"
 
     # Get slugified name of this file
@@ -86,6 +107,8 @@ class Document(SpipWritable, SpipDocuments):
 
     # Write document to output destination
     def write(self, parent_dir: str) -> str:
+        # Apply needed conversions
+        super().convert_attrs()
         # Define file source and destination
         src: str = CFG.data_dir + self.fichier
         dest: str = parent_dir + self.filename()
@@ -100,23 +123,25 @@ class SpipObject(SpipWritable):
     date: DateTimeField
     maj: str
     id_secteur: int
+    descriptif: str
+    extra: str
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         # Common fields that need conversions
-        self.titre: str = convert(self.titre, True)
-        self.descriptif: str = convert(self.descriptif, True)
-        self.texte: str = convert(self.texte)  # Convert SPIP to Markdown
         self.statut: str = "false" if self.statut == "publie" else "true"
         self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
-        self.extra: str = convert(self.extra)  # Probably unused
         # Define file prefix (needs to be redefined for sections)
         self.prefix = "index"
 
     # Convert SPIP style internal links for images & other files into Markdown style
     def link_documents(self, documents: ModelSelect) -> None:
         for d in documents:
-            self.texte = link_document(self.texte, d.id_document, d.titre, d.filename())
+            self.texte = sub(
+                DOCUMENT_LINK.format(d.id_document),
+                DOCUMENT_LINK_REPL.format(d.titre, d.filename()),
+                self.texte,
+            )
 
     # Output related documents & link them in the text by the way
     def documents(self, link_documents: bool = True) -> ModelSelect:
@@ -198,8 +223,13 @@ class SpipObject(SpipWritable):
             body += "\n\n# EXTRA\n\n" + self.extra
         return body
 
+    def convert_attrs(self, *attrs: str) -> None:
+        return super().convert_attrs(*attrs, "extra")  # Base adds titre & descriptif
+
     # Write object to output destination
     def write(self, parent_dir: str) -> str:
+        # Apply needed conversions, through self so subclass overrides are used
+        self.convert_attrs()
         # Define actual export directory
         directory: str = parent_dir + self.dir_slug()
         # Make a directory for this object if there isn’t
@@ -219,14 +249,15 @@ class Article(SpipObject, SpipArticles):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         # More conversions needed for articles
-        self.surtitre: str = convert(self.surtitre, True)  # Probably unused
-        self.soustitre: str = convert(self.soustitre, True)  # Probably unused
-        self.chapo: str = convert(self.chapo)  # Probably unused
-        self.ps: str = convert(self.ps)  # Probably unused
         self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"
         # ID
         self.object_id = self.id_article
 
+    def convert_attrs(self, *attrs: str) -> None:
+        return super().convert_attrs(
+            *attrs, "surtitre", "soustitre", "chapo", "ps"  # Text fields only
+        )
+
     def frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
         meta: dict[str, Any] = {
             # Article specific