From d20976c59d81c33077e5ebeff49c59ee31efd588 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guilhem=20Faur=C3=A9?= Date: Tue, 30 May 2023 12:16:58 +0200 Subject: [PATCH] blocks & unknown chars logging in spip2md.log --- spip2md/__init__.py | 64 ++++++++---------------------------------- spip2md/config.py | 2 ++ spip2md/regexmap.py | 4 ++- spip2md/spipobjects.py | 31 ++++++++++++++++++-- 4 files changed, 45 insertions(+), 56 deletions(-) diff --git a/spip2md/__init__.py b/spip2md/__init__.py index 3902f25..420c865 100644 --- a/spip2md/__init__.py +++ b/spip2md/__init__.py @@ -1,7 +1,6 @@ # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré -# Top level functions import sys -from os import makedirs +from os import makedirs, remove from shutil import rmtree from spip2md.config import CFG @@ -49,64 +48,23 @@ def main(*argv): if CFG.clear_output: rmtree(CFG.output_dir, True) makedirs(CFG.output_dir, exist_ok=True) + # Clear the log file + # if CFG.clear_log: + # remove(CFG.logfile) # Get the virtual id=0 section root: Rubrique = RootRubrique() # Write everything while printing the output human-readably branches, leaves = count_output(root.write_tree(CFG.output_dir)) - # End, summary message - print( - f""" + + DB.close() # Close the connection with the database + + print( # End, summary message + f"""\ Exported a total of {esc(BOLD)}{leaves}{esc()} Markdown files, \ stored into {esc(BOLD)}{branches}{esc()} directories""" ) - # print() # Break line between export & unknown characters warning - # Warn about each article that contains unknown(s) character(s) - # TODO do it with Python warnings - - DB.close() # Close the connection with the database - - -r""" OLD CODE -# Print the detected unknown chars in article in their context but highlighted -def warn_unknown_chars(article: Article) -> None: - # Print the title of the article in which there is unknown characters - # & the number of them - unknown_chars_apparitions: list[str] = unknown_chars_context(article.texte) - nb: int = len(unknown_chars_apparitions) - s: str = "s" if nb > 1 else "" - style(f"{nb}") - print(f" unknown character{s} in", end="") - style(f" {article.lang} ") - highlight(article.titre, *unknown_chars(article.titre)) - print() # Break line - # Print the context in which the unknown characters are found - for text in unknown_chars_apparitions: - style(" … ") - highlight(text, *unknown_chars(text)) - style(" … \n") - print() # Break line - -# Return a list of tuples giving the start and end of unknown substring in text -def unknown_chars(text: str) -> list[tuple[int, int]]: - positions: list[tuple[int, int]] = [] - for char in UNKNOWN_ISO: - for match in finditer("(" + char + ")+", text): - positions.append((match.start(), match.end())) - return positions - -# Return strings with unknown chards found in text, surrounded by context_length chars -def unknown_chars_context(text: str, context_length: int = 24) -> list[str]: - errors: list[str] = [] - context: str = r".{0," + str(context_length) + r"}" - for char in UNKNOWN_ISO: - matches = finditer( - context + r"(?=" + char + r")" + char + context, - text, - ) - for match in matches: - errors.append(match.group()) - return errors -""" + # Warn about issued warnings in log file + print(f"\nThere might be warnings in {esc(BOLD)}{CFG.logfile}{esc()}") diff --git a/spip2md/config.py b/spip2md/config.py index 2f2911c..f0f0777 100644 --- a/spip2md/config.py +++ b/spip2md/config.py @@ -22,10 +22,12 @@ class Configuration: output_dir: str = "output/" data_dir: str = "data/" clear_output: bool = False + clear_log: bool = True prepend_h1: bool = True export_filetype: str = "md" max_articles_export: int = 1000 # TODO reimplement with recursion max_sections_export: int = 500 # TODO reimplement with recursion + logfile: str = "spip2md.log" def __init__(self, config_file: Optional[str] = None): if config_file is not None: diff --git a/spip2md/regexmap.py b/spip2md/regexmap.py index d8f2b4c..0a2332f 100644 --- a/spip2md/regexmap.py +++ b/spip2md/regexmap.py @@ -114,7 +114,9 @@ DOCUMENT_LINK_REPL = r"\1[\2{}]({})" # Multi language block, to be further processed per lang MULTILANG_BLOCK = compile(r"(.+?)<\/multi>", S | I) -MULTILANGS = compile(r"\[([a-zA-Z\-]{2,6})\]\s*(.+?)(?=\[[a-zA-Z\-]{2,6}\]|$)", S | I) +MULTILANGS = compile( + r"\[([a-zA-Z\-]{2,6})\]\s*(.+?)\s*(?=\[[a-zA-Z\-]{2,6}\]|$)", S | I +) # WARNING probably useless text in metadata fields, to be removed BLOAT = ( diff --git a/spip2md/spipobjects.py b/spip2md/spipobjects.py index b78d6d0..c8f1a9b 100644 --- a/spip2md/spipobjects.py +++ b/spip2md/spipobjects.py @@ -1,7 +1,8 @@ # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré +import logging from os import makedirs from os.path import basename, splitext -from re import finditer, sub +from re import finditer, search, sub from shutil import copyfile from typing import Any, Match, Optional @@ -33,6 +34,9 @@ from spip2md.regexmap import ( ) from spip2md.style import BLUE, BOLD, GREEN, WARNING_STYLE, YELLOW, esc +# Output logs to logfile +logging.basicConfig(filename=CFG.logfile, encoding="utf-8") + class SpipWritable: term_color: int @@ -57,13 +61,28 @@ class SpipWritable: first_lang = lang.group(2) else: pass - # print("Found other language for", first_lang, ":", lang.groups()) + logging.warning( + f"Ignored {lang.group(1)} translation of {first_lang[:40]}: " + + lang.group(2)[:40], + ) return first_lang return MULTILANG_BLOCK.sub(replace_lang, text) # Apply different mappings to a text field, like SPIP to Markdown or encoding def convert(self, text: Optional[str]) -> str: + # Return unknown char surrounded by context_length chars + def unknown_chars_context(text: str, char: str, context_len: int = 24) -> str: + context: str = r".{0," + str(context_len) + r"}" + match = search( + context + r"(?=" + char + r")" + char + context, + text, + ) + if match is not None: + return match.group() + else: + return char + if text is not None and len(text) > 0: for spip, markdown in SPIP_MARKDOWN: text = spip.sub(markdown, text) @@ -71,6 +90,14 @@ class SpipWritable: text = bloat.sub("", text) for iso, utf in ISO_UTF: text = text.replace(iso, utf) + for char in UNKNOWN_ISO: + lastend: int = 0 + for match in finditer("(" + char + ")+", text): + context: str = unknown_chars_context(text[lastend:], char) + logging.warn( + f"Unknown char {char} found in {self.titre[:40]} at: {context}" + ) + lastend = match.end() text = self.translate(text) else: return ""