<multi> blocks & unknown chars logging in spip2md.log

2023-05-30 12:16:58 +02:00 · 2023-05-30 12:16:58 +02:00 · d20976c59d
commit d20976c59d
parent 93fc0862d6
4 changed files with 45 additions and 56 deletions
--- a/spip2md/init.py
+++ b/spip2md/init.py
@ -1,7 +1,6 @@
 # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
 # Top level functions
 import sys
-from os import makedirs
+from os import makedirs, remove
 from shutil import rmtree
 from spip2md.config import CFG
@ -49,64 +48,23 @@ def main(*argv):
    if CFG.clear_output:
        rmtree(CFG.output_dir, True)
    makedirs(CFG.output_dir, exist_ok=True)
    # Clear the log file
    # if CFG.clear_log:
    #     remove(CFG.logfile)
    # Get the virtual id=0 section
    root: Rubrique = RootRubrique()
    # Write everything while printing the output human-readably
    branches, leaves = count_output(root.write_tree(CFG.output_dir))
-    # End, summary message
+
-    print(
+    DB.close()  # Close the connection with the database
-        f"""
+
    print(  # End, summary message
        f"""\
 Exported a total of {esc(BOLD)}{leaves}{esc()} Markdown files, \
 stored into {esc(BOLD)}{branches}{esc()} directories"""
    )
-    # print()  # Break line between export & unknown characters warning
+    # Warn about issued warnings in log file
-    # Warn about each article that contains unknown(s) character(s)
+    print(f"\nThere might be warnings in {esc(BOLD)}{CFG.logfile}{esc()}")
    # TODO do it with Python warnings
    DB.close()  # Close the connection with the database
 r""" OLD CODE
 # Print the detected unknown chars in article in their context but highlighted
 def warn_unknown_chars(article: Article) -> None:
    # Print the title of the article in which there is unknown characters
    # & the number of them
    unknown_chars_apparitions: list[str] = unknown_chars_context(article.texte)
    nb: int = len(unknown_chars_apparitions)
    s: str = "s" if nb > 1 else ""
    style(f"{nb}")
    print(f" unknown character{s} in", end="")
    style(f" {article.lang} ")
    highlight(article.titre, *unknown_chars(article.titre))
    print()  # Break line
    # Print the context in which the unknown characters are found
    for text in unknown_chars_apparitions:
        style("  … ")
        highlight(text, *unknown_chars(text))
        style(" … \n")
    print()  # Break line
 # Return a list of tuples giving the start and end of unknown substring in text
 def unknown_chars(text: str) -> list[tuple[int, int]]:
    positions: list[tuple[int, int]] = []
    for char in UNKNOWN_ISO:
        for match in finditer("(" + char + ")+", text):
            positions.append((match.start(), match.end()))
    return positions
 # Return strings with unknown chards found in text, surrounded by context_length chars
 def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
    errors: list[str] = []
    context: str = r".{0," + str(context_length) + r"}"
    for char in UNKNOWN_ISO:
        matches = finditer(
            context + r"(?=" + char + r")" + char + context,
            text,
        )
        for match in matches:
            errors.append(match.group())
    return errors
 """
--- a/spip2md/config.py
+++ b/spip2md/config.py
@ -22,10 +22,12 @@ class Configuration:
    output_dir: str = "output/"
    data_dir: str = "data/"
    clear_output: bool = False
    clear_log: bool = True
    prepend_h1: bool = True
    export_filetype: str = "md"
    max_articles_export: int = 1000  # TODO reimplement with recursion
    max_sections_export: int = 500  # TODO reimplement with recursion
    logfile: str = "spip2md.log"
    def __init__(self, config_file: Optional[str] = None):
        if config_file is not None:
--- a/spip2md/regexmap.py
+++ b/spip2md/regexmap.py
@ -114,7 +114,9 @@ DOCUMENT_LINK_REPL = r"\1[\2{}]({})"
 # Multi language block, to be further processed per lang
 MULTILANG_BLOCK = compile(r"<multi>(.+?)<\/multi>", S | I)
-MULTILANGS = compile(r"\[([a-zA-Z\-]{2,6})\]\s*(.+?)(?=\[[a-zA-Z\-]{2,6}\]|$)", S | I)
+MULTILANGS = compile(
    r"\[([a-zA-Z\-]{2,6})\]\s*(.+?)\s*(?=\[[a-zA-Z\-]{2,6}\]|$)", S | I
 )
 # WARNING probably useless text in metadata fields, to be removed
 BLOAT = (
--- a/spip2md/spipobjects.py
+++ b/spip2md/spipobjects.py
@ -1,7 +1,8 @@
 # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
 import logging
 from os import makedirs
 from os.path import basename, splitext
-from re import finditer, sub
+from re import finditer, search, sub
 from shutil import copyfile
 from typing import Any, Match, Optional
@ -33,6 +34,9 @@ from spip2md.regexmap import (
 )
 from spip2md.style import BLUE, BOLD, GREEN, WARNING_STYLE, YELLOW, esc
 # Output logs to logfile
 logging.basicConfig(filename=CFG.logfile, encoding="utf-8")
 class SpipWritable:
    term_color: int
@ -57,13 +61,28 @@ class SpipWritable:
                    first_lang = lang.group(2)
                else:
                    pass
-                    # print("Found other language for", first_lang, ":", lang.groups())
+                    logging.warning(
                        f"Ignored {lang.group(1)} translation of {first_lang[:40]}: "
                        + lang.group(2)[:40],
                    )
            return first_lang
        return MULTILANG_BLOCK.sub(replace_lang, text)
    # Apply different mappings to a text field, like SPIP to Markdown or encoding
    def convert(self, text: Optional[str]) -> str:
        # Give back the first occurrence of char in text, together with up to
        # context_len characters (never crossing a line break) on each side of it.
        # char is inserted into the pattern as-is, so it is interpreted as a regex.
        # When char does not occur in text, char itself is returned
        def unknown_chars_context(text: str, char: str, context_len: int = 24) -> str:
            # Up to context_len arbitrary characters, used before and after char
            window: str = r".{0," + str(context_len) + r"}"
            pattern: str = window + r"(?=" + char + r")" + char + window
            found = search(pattern, text)
            return char if found is None else found.group()
        if text is not None and len(text) > 0:
            for spip, markdown in SPIP_MARKDOWN:
                text = spip.sub(markdown, text)
@ -71,6 +90,14 @@ class SpipWritable:
                text = bloat.sub("", text)
            for iso, utf in ISO_UTF:
                text = text.replace(iso, utf)
            for char in UNKNOWN_ISO:
                lastend: int = 0
                for match in finditer("(" + char + ")+", text):
                    context: str = unknown_chars_context(text[lastend:], char)
                    logging.warn(
                        f"Unknown char {char} found in {self.titre[:40]} at: {context}"
                    )
                    lastend = match.end()
            text = self.translate(text)
        else:
            return ""