From 32738a9269d947076421c6f1be849c55ca6aee69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guilhem=20Faur=C3=A9?= Date: Fri, 26 May 2023 14:43:39 +0200 Subject: [PATCH] separation between classes, functions & regex mappings --- spip2md/__init__.py | 29 ++++-------- spip2md/regexmap.py | 104 +++++++++++++++-------------------------- spip2md/spipobjects.py | 57 ++++++++++++++++------ 3 files changed, 91 insertions(+), 99 deletions(-) diff --git a/spip2md/__init__.py b/spip2md/__init__.py index 3c50d5e..f67b322 100644 --- a/spip2md/__init__.py +++ b/spip2md/__init__.py @@ -9,11 +9,7 @@ from peewee import ModelSelect from spip2md.config import CFG from spip2md.database import DB -from spip2md.regexmap import unknown_chars, unknown_chars_context -from spip2md.spipobjects import ( - Article, - Rubrique, -) +from spip2md.spipobjects import Rubrique # Define styles BOLD = 1 # Bold @@ -60,13 +56,7 @@ def root_sections(limit: int = 10**3) -> ModelSelect: .limit(limit) ) - -def has_unknown_chars(article: Article) -> bool: - if len(unknown_chars_context(article.texte)) > 0: - return True - return False - - +r""" # Print the detected unknown chars in article in their context but highlighted def warn_unknown_chars(article: Article) -> None: # Print the title of the article in which there is unknown characters @@ -85,6 +75,7 @@ def warn_unknown_chars(article: Article) -> None: highlight(text, *unknown_chars(text)) style(" … \n") print() # Break line +""" # Print one root section list output correctly @@ -106,16 +97,16 @@ DB.connect() def main(*argv): if len(argv) == 0: argv = sys.argv - # Define max nb of articles to export based on first CLI argument + # Define max nb of sections to export based on first CLI argument TODO if len(argv) >= 2: - articles_export = int(argv[1]) - else: - articles_export = CFG.max_articles_export - # Define max nb of sections to export based on second CLI argument - if len(argv) >= 3: - sections_export = int(argv[2]) + sections_export = int(argv[1]) else: sections_export = CFG.max_sections_export + # Define max nb of articles to export based on second CLI argument TODO + # if len(argv) >= 3: + # articles_export = int(argv[2]) + # else: + # articles_export = CFG.max_articles_export # Clear the output dir & create a new if CFG.clear_output: diff --git a/spip2md/regexmap.py b/spip2md/regexmap.py index 5222286..314f0e1 100644 --- a/spip2md/regexmap.py +++ b/spip2md/regexmap.py @@ -1,10 +1,9 @@ # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré # pyright: strict -from re import I, S, compile, finditer, sub -from typing import Optional +from re import I, S, compile -# SPIP syntax to Markdown -SPIP_TO_MARKDOWN = ( +# ((SPIP syntax, Replacement Markdown syntax), …) +SPIP_MARKDOWN = ( ( # horizontal rule compile(r"- ?- ?- ?- ?[\- ]*|
", S | I), # r"---", @@ -43,15 +42,15 @@ SPIP_TO_MARKDOWN = ( ), ( # images compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I), - r"![](\1\2)", + r"![](\2)", # Needs to be further processed to replace ID with filename ), ( # documents & embeds compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I), - r"[](\1\2)", + r"[](\2)", # Needs to be further processed to replace ID with filename ), ( # internal links compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I), - r"[](\1\2)", + r"[](\2)", # Needs to be further processed to replace ID with filename ), ( # anchor compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I), @@ -106,20 +105,31 @@ SPIP_TO_MARKDOWN = ( ), "```\n\\1\n\n```", ), - ( # WARNING remove every html tag - compile(r"<\/?.*?>\s*", S | I), - r"", - ), ) -# Further cleaning for metadata texts such as titles or descriptions -SPIP_META_BLOAT = ( +# Match against documents ID found in links, ID can be inserted with .format() +# Name and path can be further replaced with .format() +DOCUMENT_LINK = r"(!)?\[(.*?)\]\(({})\)" +DOCUMENT_LINK_REPL = r"\1[\2{}]({})" + +# Multi language block, capture groups: (lang, text, lang, text, …) +MULTILANG = compile( + r"(?:\s*\[(.{2,6})\]\s*(.*?)\s*)+<\/multi>", + S | I, +) + +# WARNING probably useless text in metadata fields, to be removed +BLOAT = ( compile(r"^>+ +", S | I), # Remove beginning with angle bracket(s) compile(r"^\d+\. +", S | I), # Remove beginning with a number followed by a dot ) -# Broken ISO encoding to proper UTF-8 -ISO_TO_UTF = ( +# Matches against every HTML tag +HTMLTAG = compile(r"<\/?.*?>\s*", S | I) + + +# ((Broken ISO 8859-1 encoding, Proper UTF equivalent encoding), …) +ISO_UTF = ( ( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1 "’", r"’", @@ -224,7 +234,7 @@ ISO_TO_UTF = ( "iÌ\u0081", r"í", ), - # WARNING not sure + # WARNING not sure below ( # Fix UTF-8 é that was interpreted as ISO 8859-1 "eÌ ", r"é", @@ -239,62 +249,22 @@ ISO_TO_UTF = ( ), ) -# WARNING unknown broken encoding +# WARNING broken ISO 8859-1 encoding which I don’t know the UTF equivalent UNKNOWN_ISO = ( - r"
", - r"∆", -) - -# Multi language block, capture the first -MULTILINGUAL = compile( - r"\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>", - S | I, + "
", + "∆", ) -# Apply SPIP to Markdown & ISO to UTF conversions to a text, & eventually clean meta -def convert(text: Optional[str], clean_meta: bool = False) -> str: - if text is None: - return "" - for spip, markdown in SPIP_TO_MARKDOWN: - text = spip.sub(markdown, text) - if clean_meta: - for bloat in SPIP_META_BLOAT: - text = bloat.sub("", text) - for iso, utf in ISO_TO_UTF: - text = text.replace(iso, utf) - return text - - -# Replace images & files links in Markdown with real slugs of the actually linked files -def link_document(text: str, id: int, name: str, slug: str) -> str: - # Replace images that dont have a title written in text - text = sub( - r"!\[]\((?:img|image)" + str(id) + r"(\|.*?)*\)", - f"![{name}]({slug})", - text, - ) - # Replace images that dont have a title written in text - text = sub( - r"\[]\((?:doc|document|emb)" + str(id) + r"(\|.*?)*\)", - f"[{name}]({slug})", - text, - ) - # Replace images that already had a title in Markdown style link - text = sub( - r"!\[(.+?)\]\((?:img|image)" + str(id) + r"(\|.*?)*\)", - f"![\\1]({slug})", - text, - ) - # Replace documents that already had a title in Markdown style link - text = sub( - r"\[(.+?)\]\((?:doc|document|emb)" + str(id) + r"(\|.*?)*\)", - f"[\\1]({slug})", - text, - ) - return text +# Special elements in terminal output to surround +SPECIAL_OUTPUT = ( + (compile(r"^([0-9]+?\.)(?= )"), r"{}\1{}"), # Counter + (compile(r"(?<= )->(?= )"), r"{}->{}"), # Arrow + (compile(r"(?<=^Exporting )([0-9]+?)(?= )"), r"{}\1{}"), # Total +) +r""" # Return a list of tuples giving the start and end of unknown substring in text def unknown_chars(text: str) -> list[tuple[int, int]]: positions: list[tuple[int, int]] = [] @@ -303,7 +273,6 @@ def unknown_chars(text: str) -> list[tuple[int, int]]: positions.append((match.start(), match.end())) return positions - # Return strings with unknown chards found in text, surrounded by context_length chars def unknown_chars_context(text: str, context_length: int = 24) -> list[str]: errors: list[str] = [] @@ -316,3 +285,4 @@ def unknown_chars_context(text: str, context_length: int = 24) -> list[str]: for match in matches: errors.append(match.group()) return errors +""" diff --git a/spip2md/spipobjects.py b/spip2md/spipobjects.py index a2abf14..ee4ffe5 100644 --- a/spip2md/spipobjects.py +++ b/spip2md/spipobjects.py @@ -1,7 +1,7 @@ # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré from os import makedirs from os.path import basename, splitext -from re import finditer +from re import finditer, sub from shutil import copyfile from typing import Any, Optional @@ -18,7 +18,16 @@ from spip2md.database import ( SpipDocumentsLiens, SpipRubriques, ) -from spip2md.regexmap import convert, link_document, unknown_chars +from spip2md.regexmap import ( + BLOAT, + DOCUMENT_LINK, + DOCUMENT_LINK_REPL, + HTMLTAG, + ISO_UTF, + MULTILANG, + SPIP_MARKDOWN, + UNKNOWN_ISO, +) class SpipWritable: @@ -26,6 +35,7 @@ class SpipWritable: texte: str lang: str titre: str + profondeur: int def filename(self, date: bool = False) -> str: raise NotImplementedError( @@ -49,6 +59,19 @@ class SpipWritable: output[-1] += "MISSING NAME" return output + # Apply different mappings to text fields, like SPIP to Markdown or encoding + def convert_attrs(self, *attrs: str) -> None: + attrs += "titre", "descriptif" + for attr in attrs: + a = getattr(self, attr) + if len(a) > 0: + for spip, markdown in SPIP_MARKDOWN: + setattr(self, attr, spip.sub(markdown, a)) + for bloat in BLOAT: + setattr(self, attr, bloat.sub("", a)) + for iso, utf in ISO_UTF: + setattr(self, attr, a.replace(iso, utf)) + # Write object to output destination def write(self, parent_dir: str) -> str: raise NotImplementedError( @@ -69,8 +92,6 @@ class Document(SpipWritable, SpipDocuments): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.titre: str = convert(self.titre, True) - self.descriptif: str = convert(self.descriptif, True) self.statut: str = "false" if self.statut == "publie" else "true" # Get slugified name of this file @@ -86,6 +107,8 @@ class Document(SpipWritable, SpipDocuments): # Write document to output destination def write(self, parent_dir: str) -> str: + # Apply needed conversions + super().convert_attrs() # Define file source and destination src: str = CFG.data_dir + self.fichier dest: str = parent_dir + self.filename() @@ -100,23 +123,25 @@ class SpipObject(SpipWritable): date: DateTimeField maj: str id_secteur: int + descriptif: str + extra: str def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Common fields that need conversions - self.titre: str = convert(self.titre, True) - self.descriptif: str = convert(self.descriptif, True) - self.texte: str = convert(self.texte) # Convert SPIP to Markdown self.statut: str = "false" if self.statut == "publie" else "true" self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true" - self.extra: str = convert(self.extra) # Probably unused # Define file prefix (needs to be redefined for sections) self.prefix = "index" # Convert SPIP style internal links for images & other files into Markdown style def link_documents(self, documents: ModelSelect) -> None: for d in documents: - self.texte = link_document(self.texte, d.id_document, d.titre, d.filename()) + self.texte = sub( + DOCUMENT_LINK.format(d.id_document), + DOCUMENT_LINK_REPL.format(d.titre, d.filename()), + self.texte, + ) # Output related documents & link them in the text by the way def documents(self, link_documents: bool = True) -> ModelSelect: @@ -198,8 +223,13 @@ class SpipObject(SpipWritable): body += "\n\n# EXTRA\n\n" + self.extra return body + def convert_attrs(self, *attrs: str) -> None: + return super().convert_attrs(*attrs, "descriptif", "extra") + # Write object to output destination def write(self, parent_dir: str) -> str: + # Apply needed conversions + super().convert_attrs() # Define actual export directory directory: str = parent_dir + self.dir_slug() # Make a directory for this object if there isn’t @@ -219,14 +249,15 @@ class Article(SpipObject, SpipArticles): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # More conversions needed for articles - self.surtitre: str = convert(self.surtitre, True) # Probably unused - self.soustitre: str = convert(self.soustitre, True) # Probably unused - self.chapo: str = convert(self.chapo) # Probably unused - self.ps: str = convert(self.ps) # Probably unused self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false" # ID self.object_id = self.id_article + def convert_attrs(self, *attrs: str) -> None: + return super().convert_attrs( + *attrs, "surtitre", "soustitre", "chapo", "ps", "accepter_forum" + ) + def frontmatter(self, append: Optional[dict[str, Any]] = None) -> str: meta: dict[str, Any] = { # Article specific