diff --git a/spip2md/__init__.py b/spip2md/__init__.py index 7262275..209d5af 100644 --- a/spip2md/__init__.py +++ b/spip2md/__init__.py @@ -3,12 +3,10 @@ import sys from os import makedirs from shutil import rmtree -from typing import Any - -from peewee import ModelSelect from spip2md.config import CFG from spip2md.database import DB +from spip2md.regexmap import SPECIAL_OUTPUT from spip2md.spipobjects import RootRubrique, Rubrique # Define styles @@ -25,66 +23,25 @@ C1 = 96 # Color C2 = 96 # Color -# Print a stylized string, without trailing newline -def style(string: str, *args: int, end: str = "") -> None: - esc = "\033[" # Terminal escape sequence, needs to be closed by "m" +# Terminal escape sequence +def esc(*args: int) -> str: if len(args) == 0: - params: str = "1;" # Defaults to bold + params: str = "0;" # Defaults to reset else: params: str = "" + # Build a string from args, that will be stripped from its trailing ; for a in args: params += str(a) + ";" - print(esc + params[:-1] + "m" + string + esc + "0m", end=end) - - -# Print a string, highlighting every substring starting at start_stop[x][0] … -def highlight(string: str, *start_stop: tuple[int, int], end: str = "") -> None: - previous_stop = 0 - for start, stop in start_stop: - print(string[previous_stop:start], end="") - style(string[start:stop], BOLD, RED) - previous_stop = stop - print(string[previous_stop:], end=end) - - -# Query the DB to retrieve all sections without parent, sorted by publication date -def root_sections(limit: int = 10**3) -> ModelSelect: - return ( - Rubrique.select() - .where(Rubrique.id_parent == 0) - .order_by(Rubrique.date.desc()) - .limit(limit) - ) - - -r""" -# Print the detected unknown chars in article in their context but highlighted -def warn_unknown_chars(article: Article) -> None: - # Print the title of the article in which there is unknown characters - # & the number of them - unknown_chars_apparitions: list[str] = unknown_chars_context(article.texte) - nb: int = 
len(unknown_chars_apparitions) - s: str = "s" if nb > 1 else "" - style(f"{nb}") - print(f" unknown character{s} in", end="") - style(f" {article.lang} ") - highlight(article.titre, *unknown_chars(article.titre)) - print() # Break line - # Print the context in which the unknown characters are found - for text in unknown_chars_apparitions: - style(" … ") - highlight(text, *unknown_chars(text)) - style(" … \n") - print() # Break line -""" + # Base terminal escape sequence that needs to be closed by "m" + return "\033[" + params[:-1] + "m" # Print one root section list output correctly # sys.setrecursionlimit(2000) def print_output( - tree: list[Any], + tree: list[str | list[str | list]], indent: str = " ", - depth: int = 0, + depth: int = -1, branches: int = 1, leaves: int = 0, ) -> tuple[int, int]: @@ -93,8 +50,11 @@ def print_output( branches, leaves = print_output( sub, indent, depth + 1, branches + 1, leaves ) - else: + elif type(sub) == str: leaves += 1 + # Highlight special elements (in bold green for the moment) + for elmnt in SPECIAL_OUTPUT: + sub = elmnt.sub(esc(BOLD, GREEN) + r"\1" + esc(), sub) print(indent * depth + sub) return (branches, leaves) @@ -125,12 +85,58 @@ def main(*argv): root: Rubrique = RootRubrique() # Write everything & print the output human-readably - sections, articles = print_output(root.write_tree(CFG.output_dir)) + branches, leaves = print_output(root.write_tree(CFG.output_dir)) # End, summary message - print(f"Exported a total of {sections} sections, containing {articles} articles") + print( + f""" +Exported a total of {leaves} Markdown files, stored into {branches} directories""" + ) # print() # Break line between export & unknown characters warning # Warn about each article that contains unknown(s) character(s) # TODO do it with Python warnings DB.close() # Close the connection with the database + + +r""" OLD CODE +# Print the detected unknown chars in article in their context but highlighted +def warn_unknown_chars(article: Article) -> 
None: + # Print the title of the article in which there is unknown characters + # & the number of them + unknown_chars_apparitions: list[str] = unknown_chars_context(article.texte) + nb: int = len(unknown_chars_apparitions) + s: str = "s" if nb > 1 else "" + style(f"{nb}") + print(f" unknown character{s} in", end="") + style(f" {article.lang} ") + highlight(article.titre, *unknown_chars(article.titre)) + print() # Break line + # Print the context in which the unknown characters are found + for text in unknown_chars_apparitions: + style(" … ") + highlight(text, *unknown_chars(text)) + style(" … \n") + print() # Break line + +# Return a list of tuples giving the start and end of unknown substring in text +def unknown_chars(text: str) -> list[tuple[int, int]]: + positions: list[tuple[int, int]] = [] + for char in UNKNOWN_ISO: + for match in finditer("(" + char + ")+", text): + positions.append((match.start(), match.end())) + return positions + +# Return strings with unknown chards found in text, surrounded by context_length chars +def unknown_chars_context(text: str, context_length: int = 24) -> list[str]: + errors: list[str] = [] + context: str = r".{0," + str(context_length) + r"}" + for char in UNKNOWN_ISO: + matches = finditer( + context + r"(?=" + char + r")" + char + context, + text, + ) + for match in matches: + errors.append(match.group()) + return errors +""" diff --git a/spip2md/regexmap.py b/spip2md/regexmap.py index 277eb03..8fe481e 100644 --- a/spip2md/regexmap.py +++ b/spip2md/regexmap.py @@ -256,31 +256,7 @@ UNKNOWN_ISO = ( # Special elements in terminal output to surround SPECIAL_OUTPUT = ( - (compile(r"^([0-9]+?\.)(?= )"), r"{}\1{}"), # Counter - (compile(r"(?<= )->(?= )"), r"{}->{}"), # Arrow - (compile(r"(?<=^Exporting )([0-9]+?)(?= )"), r"{}\1{}"), # Total + compile(r"^([0-9]+?\.)(?= )"), # Counter + compile(r"(?<= )(->)(?= )"), # Arrow + compile(r"(?<=^Exporting )([0-9]+?)(?= )"), # Total ) - - -r""" -# Return a list of tuples giving the start and 
end of unknown substring in text -def unknown_chars(text: str) -> list[tuple[int, int]]: - positions: list[tuple[int, int]] = [] - for char in UNKNOWN_ISO: - for match in finditer("(" + char + ")+", text): - positions.append((match.start(), match.end())) - return positions - -# Return strings with unknown chards found in text, surrounded by context_length chars -def unknown_chars_context(text: str, context_length: int = 24) -> list[str]: - errors: list[str] = [] - context: str = r".{0," + str(context_length) + r"}" - for char in UNKNOWN_ISO: - matches = finditer( - context + r"(?=" + char + r")" + char + context, - text, - ) - for match in matches: - errors.append(match.group()) - return errors -""" diff --git a/spip2md/spipobjects.py b/spip2md/spipobjects.py index f2f4ef5..917ccc6 100644 --- a/spip2md/spipobjects.py +++ b/spip2md/spipobjects.py @@ -406,6 +406,7 @@ class RootRubrique(Rubrique): # 0 ID self.id_rubrique = 0 # self.object_id = 0 + self.profondeur = 0 def write_tree( self, parent_dir: str, sections_limit: int = 0, articles_limit: int = 0 @@ -414,9 +415,11 @@ class RootRubrique(Rubrique): output: list[str | list[Any]] = [] # Starting message output.append( - f"Begin converting {CFG.db}@{CFG.db_host} db to plain Markdown+YAML files" + f"""\ +Begin exporting `{CFG.db}@{CFG.db_host}` SPIP database to plain Markdown+YAML +files into the directory `{parent_dir}`, as database user `{CFG.db_user}` +""" ) - output.append(f" as db user {CFG.db_user}, into the directory {parent_dir}") # Get all child section of self child_sections = ( Rubrique.select()