From b8f99fb32993e83846e30b9ad751fdab9e545d64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guilhem=20Faur=C3=A9?= Date: Mon, 15 May 2023 17:10:58 +0200 Subject: [PATCH] refactor project structure --- spip2md/config.py | 37 ++++-- spip2md/{convert.py => converter.py} | 152 ++++++++++++----------- spip2md/{SpipDatabase.py => database.py} | 22 ++-- spip2md/{Metadata.py => iterator.py} | 49 +++++++- spip2md/main.py | 50 ++++++++ spip2md/spip2md.py | 65 ---------- 6 files changed, 208 insertions(+), 167 deletions(-) rename spip2md/{convert.py => converter.py} (60%) rename spip2md/{SpipDatabase.py => database.py} (98%) rename spip2md/{Metadata.py => iterator.py} (72%) create mode 100755 spip2md/main.py delete mode 100755 spip2md/spip2md.py diff --git a/spip2md/config.py b/spip2md/config.py index f108aaf..1609640 100644 --- a/spip2md/config.py +++ b/spip2md/config.py @@ -1,7 +1,30 @@ -CONFIG = { - "db": "spip", - "dbUser": "spip", - "dbPass": "password", - "outputDir": "output", - "maxExportNb": 1000, -} +from yaml import CLoader as Loader +from yaml import load + + +class Configuration: + db = "spip" + dbHost = "localhost" + dbUser = "spip" + dbPass = "password" + outputDir = "output" + defaultNbToExport = 1000 + + def __init__(self, configFile=None) -> None: + if configFile != None: + with open(configFile) as f: + config = load(f.read(), Loader=Loader) + if "db" in config: + self.db = config["db"] + if "dbUser" in config: + self.dbUser = config["dbUser"] + if "dbPass" in config: + self.dbPass = config["dbPass"] + if "outputDir" in config: + self.outputDir = config["outputDir"] + if "defaultNbToExport" in config: + self.defaultNbToExport = config["defaultNbToExport"] + + +# config = Configuration("spip2md.yml") +config = Configuration() diff --git a/spip2md/convert.py b/spip2md/converter.py similarity index 60% rename from spip2md/convert.py rename to spip2md/converter.py index 88afff0..761b3ac 100644 --- a/spip2md/convert.py +++ b/spip2md/converter.py @@ -1,159 +1,159 @@ -import re +from re import I, S, compile # SPIP syntax to Markdown spipToMarkdown = ( ( # horizontal rule - re.compile(r"- ?- ?- ?- ?[\- ]*|
", re.S | re.I), + compile(r"- ?- ?- ?- ?[\- ]*|
", S | I), # r"---", r"***", ), ( # line break - re.compile(r"\r?\n_ *(?=\r?\n)|
", re.S | re.I), + compile(r"\r?\n_ *(?=\r?\n)|
", S | I), "\n", ), ( # heading - re.compile(r"\{\{\{ *(.*?) *\}\}\}", re.S | re.I), + compile(r"\{\{\{ *(.*?) *\}\}\}", S | I), r"# \1", # r"## \1", ), ( # strong - re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I), + compile(r"\{\{ *(.*?) *\}\}", S | I), r"**\1**", ), ( # html strong - re.compile(r" *(.*?) *", re.S | re.I), + compile(r" *(.*?) *", S | I), r"**\1**", ), ( # emphasis - re.compile(r"\{ *(.*?) *\}", re.S | re.I), + compile(r"\{ *(.*?) *\}", S | I), r"*\1*", ), ( # html emphasis - re.compile(r" *(.*?) *<\/i>", re.S | re.I), + compile(r" *(.*?) *<\/i>", S | I), r"*\1*", ), ( # strikethrough - re.compile( + compile( r"\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)", - re.S | re.I, + S | I, ), r"~\1~", ), ( # anchor - re.compile(r"\[ *(.*?) *-> *(.*?) *\]", re.S | re.I), + compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I), r"[\1](\2)", ), ( # image - re.compile(r"<(?:img|image)(.*?)(\|.*?)*>", re.S | re.I), + compile(r"<(?:img|image)(.*?)(\|.*?)*>", S | I), r"![image](\1)", ), ( # document anchor - re.compile(r"<(?:doc|emb)(.*?)(\|.*?)*>", re.S | re.I), + compile(r"<(?:doc|emb)(.*?)(\|.*?)*>", S | I), r"[document](\1)", ), ( # wikilink - re.compile(r"\[\? *(.*?) *\]", re.S | re.I), + compile(r"\[\? *(.*?) *\]", S | I), r"[\1](https://wikipedia.org/wiki/\1)", ), ( # footnote - re.compile(r"\[\[ *(.*?) *\]\]", re.S | re.I), + compile(r"\[\[ *(.*?) *\]\]", S | I), r"", ), ( # unordered list - re.compile(r"(\r?\n)-(?!#|-)\*? *", re.S | re.I), + compile(r"(\r?\n)-(?!#|-)\*? *", S | I), r"\1- ", ), ( # wrong unordered list - re.compile(r"(\r?\n)\* +", re.S | re.I), + compile(r"(\r?\n)\* +", S | I), r"\1- ", ), ( # wrong unordered list WARNING suppresses preceding tag - re.compile(r"(\r?\n)<.*?>\* +", re.I), + compile(r"(\r?\n)<.*?>\* +", I), r"\1- ", ), ( # ordered-list - re.compile(r"(\r?\n)-# *", re.S | re.I), + compile(r"(\r?\n)-# *", S | I), r"\g<1>1. ", ), ( # table-metadata - re.compile(r"(\r?\n)\|\|(.*?)\|(.*?)\|\|", re.S | re.I), + compile(r"(\r?\n)\|\|(.*?)\|(.*?)\|\|", S | I), r"", ), ( # quote - re.compile( + compile( r"<(?:quote|poesie)>\s*(.*?)\s*(?:(\r?\n){2,}|<\/(?:quote|poesie)>)", - re.S | re.I, + S | I, ), r"> \1\2\2", ), ( # box - re.compile( + compile( r"\s*(.*?)\s*(?:(?:\r?\n){2,}|<\/code>)", - re.S | re.I, + S | I, ), "`\\1`", ), ( # fence - re.compile( + compile( r"\s*(.*?)\s*(?:(?:\r?\n){2,}|<\/cadre>)", - re.S | re.I, + S | I, ), "```\n\\1\n\n```", ), ( # Keep only the first language in multi-language blocks - re.compile( + compile( r"\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>", - re.S | re.I, + S | I, ), r"\1", ), ( # WARNING remove every html tag - re.compile(r"<\/?.*?> *", re.S | re.I), + compile(r"<\/?.*?> *", S | I), r"", ), ) -spipToMetadata = ( +spipToText = ( ( # strong - re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I), + compile(r"\{\{ *(.*?) *\}\}", S | I), r"\1", ), ( # html strong - re.compile(r" *(.*?) *", re.S | re.I), + compile(r" *(.*?) *", S | I), r"\1", ), ( # emphasis - re.compile(r"\{ *(.*?) *\}", re.S | re.I), + compile(r"\{ *(.*?) *\}", S | I), r"\1", ), ( # html emphasis - re.compile(r" *(.*?) *<\/i>", re.S | re.I), + compile(r" *(.*?) *<\/i>", S | I), r"\1", ), ( # strikethrough - re.compile( + compile( r"\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)", - re.S | re.I, + S | I, ), r"\1", ), ( # Keep only the first language in multi-language blocks - re.compile( + compile( r"\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>", - re.S | re.I, + S | I, ), r"\1", ), ( # remove every html tag - re.compile(r"<\/?.*?> *", re.S | re.I), + compile(r"<\/?.*?> *", S | I), r"", ), ( # beginning with angle bracket(s) - re.compile(r"^>+ +", re.S | re.I), + compile(r"^>+ +", S | I), r"", ), ( # beginning with a number followed by a dot - re.compile(r"^\d+\. +", re.S | re.I), + compile(r"^\d+\. +", S | I), r"", ), ) @@ -161,117 +161,119 @@ spipToMetadata = ( isoToUtf = ( # Broken encoding ( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1 - re.compile("’"), + compile("’"), r"’", ), ( # Fix UTF-8 † that was interpreted as ISO 8859-1 - re.compile("‘"), + compile("‘"), r"‘", ), ( # Fix UTF-8 é that was interpreted as ISO 8859-1 - re.compile("eÌ\u0081"), + compile("eÌ\u0081"), r"é", ), ( # Fix UTF-8 è that was interpreted as ISO 8859-1 - re.compile("eÌ€"), + compile("eÌ€"), r"è", ), ( # Fix UTF-8 ê that was interpreted as ISO 8859-1 - re.compile("eÌ‚"), + compile("eÌ‚"), r"ê", ), ( # Fix UTF-8 ê that was interpreted as ISO 8859-1 - re.compile("oÌ‚"), + compile("oÌ‚"), r"ô", ), ( # Fix UTF-8 î that was interpreted as ISO 8859-1 - re.compile("iÌ‚"), + compile("iÌ‚"), r"î", ), ( # Fix UTF-8 ï that was interpreted as ISO 8859-1 - re.compile("ï"), + compile("ï"), r"ï", ), ( # Fix UTF-8 ö that was interpreted as ISO 8859-1 - re.compile("ö"), + compile("ö"), r"ö", ), ( # Fix UTF-8 ö that was interpreted as ISO 8859-1 - re.compile("ü"), + compile("ü"), r"ü", ), ( # Fix UTF-8 é that was interpreted as ISO 8859-1 - re.compile("aÌ€"), + compile("aÌ€"), r"à", ), ( # Fix UTF-8 … that was interpreted as ISO 8859-1 - re.compile("…"), + compile("…"), r"…", ), ( # Fix UTF-8 “ that was interpreted as ISO 8859-1 - re.compile("“"), + compile("“"), r"“", ), ( # Fix UTF-8 ” that was interpreted as ISO 8859-1 - re.compile("â€\u009d"), + compile("â€\u009d"), r"”", ), ( # Fix UTF-8 – that was interpreted as ISO 8859-1 - re.compile("–"), + compile("–"), r"–", ), ( # Fix UTF-8 – that was interpreted as ISO 8859-1 - re.compile("—"), + compile("—"), r"—", ), ( # Fix UTF-8 − that was interpreted as ISO 8859-1 - re.compile("â€\u0090"), + compile("â€\u0090"), r"−", ), ( # Fix UTF-8 • that was interpreted as ISO 8859-1 - re.compile("•"), + compile("•"), r"•", ), ( # Fix UTF-8 ç that was interpreted as ISO 8859-1 - re.compile("ç"), + compile("ç"), r"ç", ), ( # Fix UTF-8 í that was interpreted as ISO 8859-1 - re.compile("iÌ\u0081"), + compile("iÌ\u0081"), r"í", ), # WARNING not sure ( # Fix UTF-8 é that was interpreted as ISO 8859-1 - re.compile("eÌ "), + compile("eÌ "), r"é", ), ( # Fix UTF-8 † that was interpreted as ISO 8859-1 - re.compile("†"), + compile("†"), r"† ", ), ) ## WARNING unknown broken encoding -unknownIso = (re.compile(r"\w*
.*\r?\n"),) # unknown 
 + surroundings +unknownIso = (compile(r"\w*
.*\r?\n"),) # unknown 
 + surroundings -def convert(markup): +def convertBody(spipBody): + text = spipBody for spip, markdown in spipToMarkdown: - markup = spip.sub(markdown, markup) + text = spip.sub(markdown, text) for iso, utf in isoToUtf: - markup = iso.sub(utf, markup) + text = iso.sub(utf, text) for iso in unknownIso: - for match in iso.finditer(markup): + for match in iso.finditer(text): print(f" UNKNOWN CHARACTER {match.group()}") - return markup + return text -def convertMeta(markup): - for spip, metadata in spipToMetadata: - markup = spip.sub(metadata, markup) +def convertMeta(spipMeta): + text = spipMeta + for spip, metadata in spipToText: + text = spip.sub(metadata, text) for iso, utf in isoToUtf: - markup = iso.sub(utf, markup) + text = iso.sub(utf, text) for iso in unknownIso: - for match in iso.finditer(markup): + for match in iso.finditer(text): print(f" UNKNOWN CHARACTER {match.group()}") - return markup + return text diff --git a/spip2md/SpipDatabase.py b/spip2md/database.py similarity index 98% rename from spip2md/SpipDatabase.py rename to spip2md/database.py index 85601d9..20ce884 100644 --- a/spip2md/SpipDatabase.py +++ b/spip2md/database.py @@ -1,21 +1,13 @@ -from config import CONFIG -from peewee import * +from peewee import (SQL, BigAutoField, BigIntegerField, CharField, + CompositeKey, DateField, DateTimeField, FloatField, + IntegerField, Model, MySQLDatabase, TextField) -db = MySQLDatabase( - CONFIG["db"], - **{ - "charset": "utf8", - "sql_mode": "PIPES_AS_CONCAT", - "use_unicode": True, - "user": CONFIG["dbUser"], - "password": CONFIG["dbPass"], - } -) +# class UnknownField(object): +# def __init__(self, *_, **__): +# pass -class UnknownField(object): - def __init__(self, *_, **__): - pass +db = MySQLDatabase(None) class BaseModel(Model): diff --git a/spip2md/Metadata.py b/spip2md/iterator.py similarity index 72% rename from spip2md/Metadata.py rename to spip2md/iterator.py index 348b454..b584241 100644 --- a/spip2md/Metadata.py +++ b/spip2md/iterator.py @@ -1,10 +1,11 @@ -import yaml -from convert import convertMeta +from converter import convertBody, convertMeta +from database import * from slugify import slugify -from SpipDatabase import * +# from yaml import CDumper as Dumper +from yaml import dump -class metadata: +class Article: def __init__(self, article): self.id = article.id_article # self.surtitle = article.surtitre # Probably unused @@ -13,6 +14,7 @@ class metadata: # self.section = article.id_rubrique # TODO join self.description = convertMeta(article.descriptif) self.caption = article.chapo # Probably unused + self.text = convertBody(article.texte) # Markdown self.ps = article.ps # Probably unused self.publicationDate = article.date self.draft = False if article.statut == "publie" else True @@ -36,13 +38,16 @@ class metadata: def get_slug(self): return slugify(f"{self.id}-{self.title}") + + def get_path(self): + return self.get_slug() def get_authors(self): return SpipAuteursLiens.select().where(SpipAuteursLiens.id_objet == self.id) def get_frontmatter(self): return "---\n{}---".format( - yaml.dump( + dump( { "lang": self.lang, "title": self.title, @@ -78,3 +83,37 @@ class metadata: if len(self.microblog) > 0 else "" ) + + def get_article(self): + return "{}\n{}\n{}\n{}".format( + self.get_frontmatter(), + self.get_starting(), + self.text, + self.get_ending(), + ) + + +class Articles: + exported: int = 0 + + def __init__(self, maxToExport) -> None: + # Query the DB to retrieve all articles sorted by publication date + self.articles = ( + SpipArticles.select().order_by(SpipArticles.date.desc()).limit(maxToExport) + ) + self.toExport: int = len(self.articles) + + def remaining(self): + return self.toExport - self.exported + + def __iter__(self): + return self + + def __next__(self): + if self.remaining() <= 0: + raise StopIteration + self.exported += 1 + return ( + {"exported": self.exported, "remaining": self.remaining()}, + Article(self.articles[self.exported - 1]), + ) diff --git a/spip2md/main.py b/spip2md/main.py new file mode 100755 index 0000000..eade914 --- /dev/null +++ b/spip2md/main.py @@ -0,0 +1,50 @@ +#!python +from config import config +from database import db +from iterator import Articles + +if __name__ != "__main__": + exit() + +import sys +from os import mkdir +from shutil import rmtree + +# Clean the output dir & create a new +rmtree(config.outputDir, True) +mkdir(config.outputDir) + +# Connect to the MySQL database with Peewee ORM +db.init(config.db, host=config.dbHost, user=config.dbUser, password=config.dbPass) +db.connect() + +# Define max nb of articles to export based on first CLI param +if len(sys.argv) > 1: + maxToExport = int(sys.argv[1]) +else: + maxToExport = config.defaultNbToExport + +# Define terminal escape sequences to stylize output +R = "\033[91m" +G = "\033[92m" +B = "\033[94m" +BOLD = "\033[1m" +UNDERLINE = "\033[4m" +RESET = "\033[0m" + +# Loop among first maxToExport articles & export them +for counter, article in Articles(maxToExport): + if (counter["exported"] - 1) % 100 == 0: + print( + f"\n{BOLD}Exporting {R}{counter['remaining']+1}{RESET}" + + f"{BOLD} SPIP articles to Markdown & YAML files{RESET}\n" + ) + print(f"{BOLD}{counter['exported']}.{RESET} {article.title}") + fullPath = config.outputDir + "/" + article.get_path() + print(f"\t-> {fullPath}/index.md") + mkdir(fullPath) + with open(fullPath + "/index.md", "w") as f: + f.write(article.get_article()) + +# Close the database connection +db.close() diff --git a/spip2md/spip2md.py b/spip2md/spip2md.py deleted file mode 100755 index 5ac3d43..0000000 --- a/spip2md/spip2md.py +++ /dev/null @@ -1,65 +0,0 @@ -#!python -import sys -from os import mkdir -from shutil import rmtree - -from config import CONFIG -from convert import convert -from Metadata import metadata -from SpipDatabase import * - -# Clean the output dir & create a new -rmtree(CONFIG["outputDir"], True) -mkdir(CONFIG["outputDir"]) - -# Connect to the MySQL database with Peewee ORM -db.connect() - -# Query the DB to retrieve all articles sorted by publication date -articles = SpipArticles.select().order_by(SpipArticles.date.desc()) -# Query the DB to retrieve all articles sorted by modification date -# articles = SpipArticles.select().order_by(SpipArticles.date_modif.desc()) - -# Choose how many articles to export based on first param -if len(sys.argv) > 1: - if int(sys.argv[1]) > 0 and int(sys.argv[1]) < len(articles): - nbToExport = int(sys.argv[1]) - else: - nbToExport = len(articles) -else: - if len(articles) > CONFIG["maxExportNb"]: - nbToExport = CONFIG["maxExportNb"] - else: - nbToExport = len(articles) - -print(f"--- Export of {nbToExport} SPIP articles to Markdown & YAML files ---\n") - -# Loop among every articles & export them in Markdown files -for exported in range(nbToExport): - if exported > 0 and exported % 10 == 0: - print(f"\n--- {nbToExport - exported} articles remaining ---\n") - article = articles[exported] - meta = metadata(article) - - print(f"{exported+1}. Exporting {meta.title}") - print(f" to {meta.get_slug()}/index.md") - articleDir = "{}/{}".format(CONFIG["outputDir"], meta.get_slug()) - - mkdir(articleDir) - with open("{}/index.md".format(articleDir), "w") as f: - f.write( - "{}\n{}\n{}\n{}".format( - meta.get_frontmatter(), - meta.get_starting(), - convert(article.texte), - meta.get_ending(), - ) - ) - -# Close the database connection -db.close() - -# Announce the end of the script -print( - f"\n--- Exported {nbToExport} SPIP articles to ./{CONFIG['outputDir']}/*/index.md ---" -)