diff --git a/spip2md/__init__.py b/spip2md/__init__.py index 1c664f8..df8b28c 100644 --- a/spip2md/__init__.py +++ b/spip2md/__init__.py @@ -1,6 +1,8 @@ # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré +import logging import sys -from os import makedirs +from os import makedirs, remove +from os.path import isfile from shutil import rmtree from spip2md.config import CFG @@ -27,6 +29,18 @@ def count_output( return (branches, leaves) +# Clear the previous log file if needed +if CFG.clear_log and isfile(CFG.logfile): + remove(CFG.logfile) +# Configure logging +logging.basicConfig( + format="%(levelname)s:%(message)s", + filename=CFG.logfile, + encoding="utf-8", + level=CFG.loglevel, +) + + # Connect to the MySQL database with Peewee ORM DB.init(CFG.db, host=CFG.db_host, user=CFG.db_user, password=CFG.db_pass) DB.connect() @@ -64,4 +78,4 @@ stored into {esc(BOLD)}{branches}{esc()} directories""" ) # Warn about issued warnings in log file - print(f"\nThere might be warnings in {esc(BOLD)}{CFG.logfile}{esc()}") + print(f"\nThere might be warnings and infos in {esc(BOLD)}{CFG.logfile}{esc()}") diff --git a/spip2md/config.py b/spip2md/config.py index f0f0777..05caaef 100644 --- a/spip2md/config.py +++ b/spip2md/config.py @@ -25,9 +25,10 @@ class Configuration: clear_log: bool = True prepend_h1: bool = True export_filetype: str = "md" - max_articles_export: int = 1000 # TODO reimplement with recursion - max_sections_export: int = 500 # TODO reimplement with recursion logfile: str = "spip2md.log" + loglevel: str = "INFO" + # max_articles_export: int = 1000 # TODO reimplement with recursion + # max_sections_export: int = 500 # TODO reimplement with recursion def __init__(self, config_file: Optional[str] = None): if config_file is not None: diff --git a/spip2md/regexmap.py b/spip2md/regexmap.py index ef9616c..2d348ca 100644 --- a/spip2md/regexmap.py +++ b/spip2md/regexmap.py @@ -137,6 +137,17 @@ ARTICLE_LINK = ( ), ) # Name and path can be 
further replaced with .format() +SECTION_LINK = ( + ( # SPIP style section links + compile(r"<()(?:rub|rubrique)([0-9]+)(?:\|(.*?))?>", S | I), + r"[{}]({})", + ), + ( # Markdown style internal links + compile(r"\[(.*?)\]\((?:rub|rubrique)([0-9]+)(?:\|(.*?))?\)", S | I), + r"[\1{}]({})", + ), +) # Name and path can be further replaced with .format() + # Multi language block, to be further processed per lang MULTILANG_BLOCK = compile(r"(.+?)<\/multi>", S | I) MULTILANGS = compile( diff --git a/spip2md/spipobjects.py b/spip2md/spipobjects.py index d07aa11..fdba927 100644 --- a/spip2md/spipobjects.py +++ b/spip2md/spipobjects.py @@ -1,12 +1,12 @@ # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré import logging -from os import makedirs, remove +from os import makedirs from os.path import basename, splitext from re import finditer, search from shutil import copyfile from typing import Any, Match, Optional -from peewee import BigAutoField, DateTimeField, ModelSelect +from peewee import BigAutoField, DateTimeField, DoesNotExist, ModelSelect from slugify import slugify from yaml import dump @@ -27,6 +27,7 @@ from spip2md.regexmap import ( ISO_UTF, MULTILANG_BLOCK, MULTILANGS, + SECTION_LINK, SPECIAL_OUTPUT, SPIP_MARKDOWN, UNKNOWN_ISO, @@ -34,12 +35,6 @@ from spip2md.regexmap import ( ) from spip2md.style import BLUE, BOLD, GREEN, WARNING_STYLE, YELLOW, esc -# Clear the previous log file if needed -if CFG.clear_log: - remove(CFG.logfile) -# Output logs to logfile -logging.basicConfig(filename=CFG.logfile, encoding="utf-8") - class SpipWritable: term_color: int @@ -63,10 +58,10 @@ class SpipWritable: # Outputs the first lang associated text first_lang = lang.group(2) else: - pass + title: str = first_lang[:40].strip(" \n") + translate: str = lang.group(2)[:40].strip(" \n") logging.warning( - f"Ignored {lang.group(1)} translation of {first_lang[:40]}: " - + lang.group(2)[:40], + f"Ignored {lang.group(1)} translation of {title}: 
{translate}", ) return first_lang @@ -215,38 +210,44 @@ class SpipObject(SpipWritable): extra: str def convert(self, text: Optional[str], clean_html: bool = True) -> str: + def found_replace(path_link: str, doc: Any, text: str, match: Match) -> str: + repl: str = path_link.format(doc.titre, doc.filename()) + logging.info(f"Translating link to {repl}") + return text.replace(match.group(), repl) + + def not_found_warn(path_link: str, text: str, match: Match) -> str: + logging.warning(f"No object for link {match.group()} in {self.titre}") + return text.replace(match.group(), path_link.format("", "NOT FOUND")) + if text is not None and len(text) > 0: for id_link, path_link in DOCUMENT_LINK: for match in id_link.finditer(text): - doc: Document = Document.get(Document.id_document == match.group(2)) - if doc is not None: - text = text.replace( - match.group(), path_link.format(doc.titre, doc.filename()) - ) - else: - logging.warn( - f"No document for link {match.group()} in {self.titre}" - ) - text = text.replace( - match.group(), path_link.format("", "NOT FOUND") + logging.info(f"Found document link {match.group()} in {self.titre}") + try: + doc: Document = Document.get( + Document.id_document == match.group(2) ) + text = found_replace(path_link, doc, text, match) + except DoesNotExist: + text = not_found_warn(path_link, text, match) for id_link, path_link in ARTICLE_LINK: for match in id_link.finditer(text): - art: Article = Article.get(Article.id_article == match.group(2)) - if art is not None: - text = text.replace( - match.group(), - path_link.format( - art.titre, f"{art.dir_slug()}/{art.filename()}" - ), - ) - else: - logging.warn( - f"No article for link {match.group()} in {self.titre}" - ) - text = text.replace( - match.group(), path_link.format("", "NOT FOUND") + logging.info(f"Found article link {match.group()} in {self.titre}") + try: + art: Article = Article.get(Article.id_article == match.group(2)) + text = found_replace(path_link, art, text, match) + except 
DoesNotExist: + text = not_found_warn(path_link, text, match) + for id_link, path_link in SECTION_LINK: + for match in id_link.finditer(text): + logging.info(f"Found section link {match.group()} in {self.titre}") + try: + section: Rubrique = Rubrique.get( + Rubrique.id_rubrique == match.group(2) ) + text = found_replace(path_link, section, text, match) + except DoesNotExist: + text = not_found_warn(path_link, text, match) else: return "" return super().convert(text, clean_html) @@ -480,9 +481,7 @@ class RootRubrique(Rubrique): # self.object_id = 0 self.profondeur = 0 - def write_tree( - self, parent_dir: str, sections_limit: int = 0, articles_limit: int = 0 - ) -> list[str | list]: + def write_tree(self, parent_dir: str) -> list[str | list]: # Define dictionary output to diplay output: list[str | list] = [] # Print starting message