unknown characters highlighting & reporting

2023-05-16 11:19:47 +02:00 · 2023-05-16 11:19:47 +02:00 · b61853a4d5
commit b61853a4d5
parent 12db0375e7
3 changed files with 80 additions and 41 deletions
--- a/spip2md/converter.py
+++ b/spip2md/converter.py
@ -253,32 +253,30 @@ isoToUtf = (
 ## WARNING unknown broken encoding
 unknownIso = (
-    compile(r"\w*â€¨.*\r?\n"),  # unknown â€¨ + surroundings
+    (  # unknown â€¨ + surroundings
-    compile(r"\w*âˆ†.*\r?\n"),  # unknown â^† + surroundings
+        compile(r"â€¨"),
        compile(r"â€¨.*(?=\r?\n|$)"),
    ),
    (  # unknown â^† + surroundings
        compile(r"âˆ†"),
        compile(r"âˆ†.*(?=\r?\n|$)"),
    ),
 )
 def convertBody(spipBody):
-    text = spipBody
+    text: str = spipBody
    errors = []
    for spip, markdown in spipToMarkdown:
        text = spip.sub(markdown, text)
    for iso, utf in isoToUtf:
        text = iso.sub(utf, text)
-    for iso in unknownIso:
+    return text
        for match in iso.finditer(text):
            errors.append(match.group())
    return text, errors
 def convertMeta(spipMeta):
-    text = spipMeta
+    text: str = spipMeta
    errors = []
    for spip, metadata in spipToText:
        text = spip.sub(metadata, text)
    for iso, utf in isoToUtf:
        text = iso.sub(utf, text)
-    for iso in unknownIso:
+    return text
        for match in iso.finditer(text):
            errors.append(match.group())
    return text, errors
--- a/spip2md/iterator.py
+++ b/spip2md/iterator.py
@ -1,6 +1,6 @@
-from array import array
+from re import escape
-from converter import convertBody, convertMeta
+from converter import convertBody, convertMeta, unknownIso
 from database import *
 from slugify import slugify
 # from yaml import CDumper as Dumper
@ -11,12 +11,12 @@ class Article:
    def __init__(self, article):
        self.id = article.id_article
        # self.surtitle = article.surtitre  # Probably unused
-        self.title, self.title_unknown = convertMeta(article.titre)
+        self.title = convertMeta(article.titre)
        self.subtitle = article.soustitre  # Probably unused
        # self.section = article.id_rubrique # TODO join
-        self.description, self.description_unknown = convertMeta(article.descriptif)
+        self.description = convertMeta(article.descriptif)
        self.caption = article.chapo  # Probably unused
-        self.text, self.text_unknown = convertBody(article.texte)  # Markdown
+        self.text = convertBody(article.texte)  # Markdown
        self.ps = article.ps  # Probably unused
        self.publicationDate = article.date
        self.draft = False if article.statut == "publie" else True
@ -38,16 +38,16 @@ class Article:
        self.virtual = article.virtuel  # TODO Why ?
        self.microblog = article.microblog  # Probably unused
-    def get_slug(self):
+    def getSlug(self):
        return slugify(f"{self.id}-{self.title}")
-    def get_path(self):
+    def getPath(self):
-        return self.get_slug()
+        return self.getSlug()
-    def get_authors(self):
+    def getAuthors(self):
        return SpipAuteursLiens.select().where(SpipAuteursLiens.id_objet == self.id)
-    def get_frontmatter(self):
+    def getFrontmatter(self):
        return dump(
            {
                "lang": self.lang,
@ -58,14 +58,14 @@ class Article:
                "lastmod": self.update,
                "draft": self.draft,
                "description": self.description,
-                "authors": [author.id_auteur for author in self.get_authors()],
+                "authors": [author.id_auteur for author in self.getAuthors()],
            },
            allow_unicode=True,
        )
-    def get_article(self):
+    def getArticle(self):
        # Build the final article text
-        article: str = "---\n" + self.get_frontmatter() + "---"
+        article: str = "---\n" + self.getFrontmatter() + "---"
        # If there is a caption, add the caption followed by a hr
        if len(self.caption) > 0:
            article += "\n\n" + self.caption + "\n\n***"
@ -83,10 +83,34 @@ class Article:
            article += "\n\n# MICROBLOGGING\n\n" + self.microblog
        return article
    def getUnknownChars(self):
        """Return every unknown-character excerpt found in this article.

        Both the title and the body text are scanned with the second regex
        of each ``unknownIso`` pair (the one that also captures what follows
        the broken character), and the matched strings are returned in scan
        order: title first, then text; for each, pattern by pattern.
        """
        return [
            found.group()
            for field in (self.title, self.text)
            for _, context in unknownIso
            for found in context.finditer(field)
        ]
 def highlightUnknownChars(text, patterns=None):
    """Return ``text`` with every unknown character wrapped in red + bold.

    Args:
        text: The string to decorate for terminal output.
        patterns: Optional iterable of ``(char_regex, surrounding_regex)``
            pairs of compiled patterns. Only the first element of each pair
            is used here. Defaults to the module-level ``unknownIso``.

    Returns:
        A copy of ``text`` where each match of each ``char_regex`` is
        preceded by the colour escape sequence and followed by the reset one.
    """
    # Terminal escape sequences used to stylize the output
    COLOR = "\033[91m" + "\033[1m"  # Red + Bold
    RESET = "\033[0m"
    if patterns is None:
        patterns = unknownIso

    def paint(match):
        return COLOR + match.group() + RESET

    # Substitute in a single pass per pattern: splicing into ``text`` while
    # iterating ``finditer`` would reuse offsets computed on the unmodified
    # string and misplace every highlight after the first one.
    # A callable replacement also avoids any escaping issue in the template.
    for char, _ in patterns:
        text = char.sub(paint, text)
    return text
 class Articles:
    exported: int = 0
    unknownChars: list = []
    def __init__(self, maxToExport) -> None:
        # Query the DB to retrieve all articles sorted by publication date
@ -103,9 +127,10 @@ class Articles:
    def __next__(self):
        if self.remaining() <= 0:
-            raise StopIteration()
+            raise StopIteration
        self.exported += 1
        article = Article(self.articles[self.exported - 1])
        return (
            {"exported": self.exported, "remaining": self.remaining()},
-            Article(self.articles[self.exported - 1]),
+            article,
        )
--- a/spip2md/main.py
+++ b/spip2md/main.py
@ -1,7 +1,7 @@
 #!python
 from config import config
 from database import db
-from iterator import Articles
+from iterator import Articles, highlightUnknownChars
 if __name__ != "__main__":
    exit()
@ -25,12 +25,13 @@ else:
    maxToExport = config.defaultNbToExport
 # Define terminal escape sequences to stylize output
-R = "\033[91m"
+R: str = "\033[91m"
-G = "\033[92m"
+G: str = "\033[92m"
-B = "\033[94m"
+B: str = "\033[94m"
-BOLD = "\033[1m"
+BOLD: str = "\033[1m"
-UNDERLINE = "\033[4m"
+RESET: str = "\033[0m"
-RESET = "\033[0m"
+
 unknownChars: dict = {}
 # Loop among first maxToExport articles & export them
 for counter, article in Articles(maxToExport):
@ -39,12 +40,27 @@ for counter, article in Articles(maxToExport):
            f"\n{BOLD}Exporting {R}{counter['remaining']+1}{RESET}"
            + f"{BOLD} SPIP articles to Markdown & YAML files{RESET}\n"
        )
-    print(f"{BOLD}{counter['exported']}.{RESET} {article.title}")
+    print(
-    fullPath = config.outputDir + "/" + article.get_path()
+        f"{BOLD}{counter['exported']}.{RESET} " + highlightUnknownChars(article.title)
-    print(f"\t-> {fullPath}/index.md")
+    )
    fullPath = config.outputDir + "/" + article.getPath()
    print(f"{BOLD}>{RESET} {fullPath}/index.md")
    mkdir(fullPath)
    with open(fullPath + "/index.md", "w") as f:
-        f.write(article.get_article())
+        f.write(article.getArticle())
    # Store detected unknown characters
    if len(article.getUnknownChars()) > 0:
        unknownChars[article.title] = article.getUnknownChars()
 # Report, for each article title, the unknown-character excerpts stored above
 for title, excerpts in unknownChars.items():
    count = len(excerpts)
    plural = "s" if count > 1 else ""
    header = f"\n{BOLD}{count} unknown character{plural} detected in{RESET} "
    print(header + highlightUnknownChars(title))
    # One line per excerpt, with the offending characters highlighted
    for excerpt in excerpts:
        print(f"  {BOLD}…{RESET} " + highlightUnknownChars(excerpt))
 # Close the database connection
 db.close()