refactor project structure

This commit is contained in:
Guilhem Fauré 2023-05-15 17:10:58 +02:00
parent 8eb0d1101a
commit b8f99fb329
6 changed files with 208 additions and 167 deletions

View File

@ -1,7 +1,30 @@
CONFIG = { from yaml import CLoader as Loader
"db": "spip", from yaml import load
"dbUser": "spip",
"dbPass": "password",
"outputDir": "output", class Configuration:
"maxExportNb": 1000, db = "spip"
} dbHost = "localhost"
dbUser = "spip"
dbPass = "password"
outputDir = "output"
defaultNbToExport = 1000
def __init__(self, configFile=None) -> None:
    """Create a configuration, optionally overriding defaults from a YAML file.

    Args:
        configFile: Path of a YAML file whose top-level keys may override the
            class-level defaults. When None, the defaults are left untouched.
    """
    if configFile is None:
        return
    with open(configFile) as f:
        # NOTE(review): Loader is yaml's CLoader, i.e. the full (non-safe)
        # loader; fine for a trusted local file, consider CSafeLoader if the
        # file can come from elsewhere.
        # An empty YAML document loads as None, hence the `or {}` fallback.
        config = load(f.read(), Loader=Loader) or {}
    # Every setting that has a class-level default can be overridden,
    # including dbHost, which is used to open the database connection.
    for key in (
        "db",
        "dbHost",
        "dbUser",
        "dbPass",
        "outputDir",
        "defaultNbToExport",
    ):
        if key in config:
            setattr(self, key, config[key])
# config = Configuration("spip2md.yml")
config = Configuration()

View File

@ -1,159 +1,159 @@
import re from re import I, S, compile
# SPIP syntax to Markdown # SPIP syntax to Markdown
spipToMarkdown = ( spipToMarkdown = (
( # horizontal rule ( # horizontal rule
re.compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", re.S | re.I), compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I),
# r"---", # r"---",
r"***", r"***",
), ),
( # line break ( # line break
re.compile(r"\r?\n_ *(?=\r?\n)|<br ?.*?>", re.S | re.I), compile(r"\r?\n_ *(?=\r?\n)|<br ?.*?>", S | I),
"\n", "\n",
), ),
( # heading ( # heading
re.compile(r"\{\{\{ *(.*?) *\}\}\}", re.S | re.I), compile(r"\{\{\{ *(.*?) *\}\}\}", S | I),
r"# \1", r"# \1",
# r"## \1", # r"## \1",
), ),
( # strong ( # strong
re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I), compile(r"\{\{ *(.*?) *\}\}", S | I),
r"**\1**", r"**\1**",
), ),
( # html strong ( # html strong
re.compile(r"<strong> *(.*?) *</strong>", re.S | re.I), compile(r"<strong> *(.*?) *</strong>", S | I),
r"**\1**", r"**\1**",
), ),
( # emphasis ( # emphasis
re.compile(r"\{ *(.*?) *\}", re.S | re.I), compile(r"\{ *(.*?) *\}", S | I),
r"*\1*", r"*\1*",
), ),
( # html emphasis ( # html emphasis
re.compile(r"<i> *(.*?) *<\/i>", re.S | re.I), compile(r"<i> *(.*?) *<\/i>", S | I),
r"*\1*", r"*\1*",
), ),
( # strikethrough ( # strikethrough
re.compile( compile(
r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)", r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
re.S | re.I, S | I,
), ),
r"~\1~", r"~\1~",
), ),
( # anchor ( # anchor
re.compile(r"\[ *(.*?) *-> *(.*?) *\]", re.S | re.I), compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
r"[\1](\2)", r"[\1](\2)",
), ),
( # image ( # image
re.compile(r"<(?:img|image)(.*?)(\|.*?)*>", re.S | re.I), compile(r"<(?:img|image)(.*?)(\|.*?)*>", S | I),
r"![image](\1)", r"![image](\1)",
), ),
( # document anchor ( # document anchor
re.compile(r"<(?:doc|emb)(.*?)(\|.*?)*>", re.S | re.I), compile(r"<(?:doc|emb)(.*?)(\|.*?)*>", S | I),
r"[document](\1)", r"[document](\1)",
), ),
( # wikilink ( # wikilink
re.compile(r"\[\? *(.*?) *\]", re.S | re.I), compile(r"\[\? *(.*?) *\]", S | I),
r"[\1](https://wikipedia.org/wiki/\1)", r"[\1](https://wikipedia.org/wiki/\1)",
), ),
( # footnote ( # footnote
re.compile(r"\[\[ *(.*?) *\]\]", re.S | re.I), compile(r"\[\[ *(.*?) *\]\]", S | I),
r"", r"",
), ),
( # unordered list ( # unordered list
re.compile(r"(\r?\n)-(?!#|-)\*? *", re.S | re.I), compile(r"(\r?\n)-(?!#|-)\*? *", S | I),
r"\1- ", r"\1- ",
), ),
( # wrong unordered list ( # wrong unordered list
re.compile(r"(\r?\n)\* +", re.S | re.I), compile(r"(\r?\n)\* +", S | I),
r"\1- ", r"\1- ",
), ),
( # wrong unordered list WARNING suppresses preceding tag ( # wrong unordered list WARNING suppresses preceding tag
re.compile(r"(\r?\n)<.*?>\* +", re.I), compile(r"(\r?\n)<.*?>\* +", I),
r"\1- ", r"\1- ",
), ),
( # ordered-list ( # ordered-list
re.compile(r"(\r?\n)-# *", re.S | re.I), compile(r"(\r?\n)-# *", S | I),
r"\g<1>1. ", r"\g<1>1. ",
), ),
( # table-metadata ( # table-metadata
re.compile(r"(\r?\n)\|\|(.*?)\|(.*?)\|\|", re.S | re.I), compile(r"(\r?\n)\|\|(.*?)\|(.*?)\|\|", S | I),
r"", r"",
), ),
( # quote ( # quote
re.compile( compile(
r"<(?:quote|poesie)>\s*(.*?)\s*(?:(\r?\n){2,}|<\/(?:quote|poesie)>)", r"<(?:quote|poesie)>\s*(.*?)\s*(?:(\r?\n){2,}|<\/(?:quote|poesie)>)",
re.S | re.I, S | I,
), ),
r"> \1\2\2", r"> \1\2\2",
), ),
( # box ( # box
re.compile( compile(
r"<code>\s*(.*?)\s*(?:(?:\r?\n){2,}|<\/code>)", r"<code>\s*(.*?)\s*(?:(?:\r?\n){2,}|<\/code>)",
re.S | re.I, S | I,
), ),
"`\\1`", "`\\1`",
), ),
( # fence ( # fence
re.compile( compile(
r"<cadre>\s*(.*?)\s*(?:(?:\r?\n){2,}|<\/cadre>)", r"<cadre>\s*(.*?)\s*(?:(?:\r?\n){2,}|<\/cadre>)",
re.S | re.I, S | I,
), ),
"```\n\\1\n\n```", "```\n\\1\n\n```",
), ),
( # Keep only the first language in multi-language blocks ( # Keep only the first language in multi-language blocks
re.compile( compile(
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>", r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
re.S | re.I, S | I,
), ),
r"\1", r"\1",
), ),
( # WARNING remove every html tag ( # WARNING remove every html tag
re.compile(r"<\/?.*?> *", re.S | re.I), compile(r"<\/?.*?> *", S | I),
r"", r"",
), ),
) )
spipToMetadata = ( spipToText = (
( # strong ( # strong
re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I), compile(r"\{\{ *(.*?) *\}\}", S | I),
r"\1", r"\1",
), ),
( # html strong ( # html strong
re.compile(r"<strong> *(.*?) *</strong>", re.S | re.I), compile(r"<strong> *(.*?) *</strong>", S | I),
r"\1", r"\1",
), ),
( # emphasis ( # emphasis
re.compile(r"\{ *(.*?) *\}", re.S | re.I), compile(r"\{ *(.*?) *\}", S | I),
r"\1", r"\1",
), ),
( # html emphasis ( # html emphasis
re.compile(r"<i> *(.*?) *<\/i>", re.S | re.I), compile(r"<i> *(.*?) *<\/i>", S | I),
r"\1", r"\1",
), ),
( # strikethrough ( # strikethrough
re.compile( compile(
r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)", r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
re.S | re.I, S | I,
), ),
r"\1", r"\1",
), ),
( # Keep only the first language in multi-language blocks ( # Keep only the first language in multi-language blocks
re.compile( compile(
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>", r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
re.S | re.I, S | I,
), ),
r"\1", r"\1",
), ),
( # remove every html tag ( # remove every html tag
re.compile(r"<\/?.*?> *", re.S | re.I), compile(r"<\/?.*?> *", S | I),
r"", r"",
), ),
( # beginning with angle bracket(s) ( # beginning with angle bracket(s)
re.compile(r"^>+ +", re.S | re.I), compile(r"^>+ +", S | I),
r"", r"",
), ),
( # beginning with a number followed by a dot ( # beginning with a number followed by a dot
re.compile(r"^\d+\. +", re.S | re.I), compile(r"^\d+\. +", S | I),
r"", r"",
), ),
) )
@ -161,117 +161,119 @@ spipToMetadata = (
isoToUtf = ( isoToUtf = (
# Broken encoding # Broken encoding
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1 ( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
re.compile("’"), compile("’"),
r"", r"",
), ),
( # Fix UTF-8 ‘ that was interpreted as ISO 8859-1 ( # Fix UTF-8 ‘ that was interpreted as ISO 8859-1
re.compile("‘"), compile("‘"),
r"", r"",
), ),
( # Fix UTF-8 é that was interpreted as ISO 8859-1 ( # Fix UTF-8 é that was interpreted as ISO 8859-1
re.compile("\u0081"), compile("\u0081"),
r"é", r"é",
), ),
( # Fix UTF-8 è that was interpreted as ISO 8859-1 ( # Fix UTF-8 è that was interpreted as ISO 8859-1
re.compile("è"), compile("è"),
r"è", r"è",
), ),
( # Fix UTF-8 ê that was interpreted as ISO 8859-1 ( # Fix UTF-8 ê that was interpreted as ISO 8859-1
re.compile(""), compile(""),
r"ê", r"ê",
), ),
( # Fix UTF-8 ô that was interpreted as ISO 8859-1 ( # Fix UTF-8 ô that was interpreted as ISO 8859-1
re.compile(""), compile(""),
r"ô", r"ô",
), ),
( # Fix UTF-8 î that was interpreted as ISO 8859-1 ( # Fix UTF-8 î that was interpreted as ISO 8859-1
re.compile(""), compile(""),
r"î", r"î",
), ),
( # Fix UTF-8 ï that was interpreted as ISO 8859-1 ( # Fix UTF-8 ï that was interpreted as ISO 8859-1
re.compile("ˆ"), compile("ˆ"),
r"ï", r"ï",
), ),
( # Fix UTF-8 ö that was interpreted as ISO 8859-1 ( # Fix UTF-8 ö that was interpreted as ISO 8859-1
re.compile("ˆ"), compile("ˆ"),
r"ö", r"ö",
), ),
( # Fix UTF-8 ü that was interpreted as ISO 8859-1 ( # Fix UTF-8 ü that was interpreted as ISO 8859-1
re.compile("ˆ"), compile("ˆ"),
r"ü", r"ü",
), ),
( # Fix UTF-8 à that was interpreted as ISO 8859-1 ( # Fix UTF-8 à that was interpreted as ISO 8859-1
re.compile("à"), compile("à"),
r"à", r"à",
), ),
( # Fix UTF-8 … that was interpreted as ISO 8859-1 ( # Fix UTF-8 … that was interpreted as ISO 8859-1
re.compile("…"), compile("…"),
r"", r"",
), ),
( # Fix UTF-8 “ that was interpreted as ISO 8859-1 ( # Fix UTF-8 “ that was interpreted as ISO 8859-1
re.compile("“"), compile("“"),
r"", r"",
), ),
( # Fix UTF-8 ” that was interpreted as ISO 8859-1 ( # Fix UTF-8 ” that was interpreted as ISO 8859-1
re.compile("â€\u009d"), compile("â€\u009d"),
r"", r"",
), ),
( # Fix UTF-8 – (en dash) that was interpreted as ISO 8859-1 ( # Fix UTF-8 – (en dash) that was interpreted as ISO 8859-1
re.compile("–"), compile("–"),
r"", r"",
), ),
( # Fix UTF-8 — (em dash) that was interpreted as ISO 8859-1 ( # Fix UTF-8 — (em dash) that was interpreted as ISO 8859-1
re.compile("—"), compile("—"),
r"", r"",
), ),
( # Fix UTF-8 ‐ (hyphen) that was interpreted as ISO 8859-1 ( # Fix UTF-8 ‐ (hyphen) that was interpreted as ISO 8859-1
re.compile("â€\u0090"), compile("â€\u0090"),
r"", r"",
), ),
( # Fix UTF-8 • that was interpreted as ISO 8859-1 ( # Fix UTF-8 • that was interpreted as ISO 8859-1
re.compile("•"), compile("•"),
r"", r"",
), ),
( # Fix UTF-8 ç that was interpreted as ISO 8859-1 ( # Fix UTF-8 ç that was interpreted as ISO 8859-1
re.compile("ç"), compile("ç"),
r"ç", r"ç",
), ),
( # Fix UTF-8 í that was interpreted as ISO 8859-1 ( # Fix UTF-8 í that was interpreted as ISO 8859-1
re.compile("\u0081"), compile("\u0081"),
r"í", r"í",
), ),
# WARNING not sure # WARNING not sure
( # Fix UTF-8 é that was interpreted as ISO 8859-1 ( # Fix UTF-8 é that was interpreted as ISO 8859-1
re.compile(""), compile(""),
r"é", r"é",
), ),
( # Fix UTF-8 † that was interpreted as ISO 8859-1 ( # Fix UTF-8 † that was interpreted as ISO 8859-1
re.compile("†"), compile("†"),
r"", r"",
), ),
) )
## WARNING unknown broken encoding ## WARNING unknown broken encoding
unknownIso = (re.compile(r"\w*
.*\r?\n"),) # unknown 
 + surroundings unknownIso = (compile(r"\w*
.*\r?\n"),) # unknown 
 + surroundings
def _convert(text, rules):
    """Apply `rules`, then the encoding fixes, and report unknown characters.

    Args:
        text: The SPIP-formatted string to convert.
        rules: Iterable of (compiled pattern, replacement) pairs, applied in
            order before the `isoToUtf` pairs.

    Returns:
        The converted string.
    """
    for pattern, replacement in rules:
        text = pattern.sub(replacement, text)
    for iso, utf in isoToUtf:
        text = iso.sub(utf, text)
    # Nothing is replaced here: remaining suspicious sequences are only printed
    for iso in unknownIso:
        for match in iso.finditer(text):
            print(f" UNKNOWN CHARACTER {match.group()}")
    return text


def convertBody(spipBody):
    """Convert a SPIP article body using the `spipToMarkdown` rules."""
    return _convert(spipBody, spipToMarkdown)


def convertMeta(spipMeta):
    """Convert a SPIP metadata field using the `spipToText` rules."""
    return _convert(spipMeta, spipToText)

View File

@ -1,21 +1,13 @@
from config import CONFIG from peewee import (SQL, BigAutoField, BigIntegerField, CharField,
from peewee import * CompositeKey, DateField, DateTimeField, FloatField,
IntegerField, Model, MySQLDatabase, TextField)
db = MySQLDatabase( # class UnknownField(object):
CONFIG["db"], # def __init__(self, *_, **__):
**{ # pass
"charset": "utf8",
"sql_mode": "PIPES_AS_CONCAT",
"use_unicode": True,
"user": CONFIG["dbUser"],
"password": CONFIG["dbPass"],
}
)
class UnknownField(object): db = MySQLDatabase(None)
def __init__(self, *_, **__):
pass
class BaseModel(Model): class BaseModel(Model):

View File

@ -1,10 +1,11 @@
import yaml from converter import convertBody, convertMeta
from convert import convertMeta from database import *
from slugify import slugify from slugify import slugify
from SpipDatabase import * # from yaml import CDumper as Dumper
from yaml import dump
class metadata: class Article:
def __init__(self, article): def __init__(self, article):
self.id = article.id_article self.id = article.id_article
# self.surtitle = article.surtitre # Probably unused # self.surtitle = article.surtitre # Probably unused
@ -13,6 +14,7 @@ class metadata:
# self.section = article.id_rubrique # TODO join # self.section = article.id_rubrique # TODO join
self.description = convertMeta(article.descriptif) self.description = convertMeta(article.descriptif)
self.caption = article.chapo # Probably unused self.caption = article.chapo # Probably unused
self.text = convertBody(article.texte) # Markdown
self.ps = article.ps # Probably unused self.ps = article.ps # Probably unused
self.publicationDate = article.date self.publicationDate = article.date
self.draft = False if article.statut == "publie" else True self.draft = False if article.statut == "publie" else True
@ -37,12 +39,15 @@ class metadata:
def get_slug(self): def get_slug(self):
return slugify(f"{self.id}-{self.title}") return slugify(f"{self.id}-{self.title}")
# Path of this article's directory, relative to the export output directory.
# Currently this is exactly the slug returned by get_slug().
def get_path(self):
return self.get_slug()
def get_authors(self): def get_authors(self):
return SpipAuteursLiens.select().where(SpipAuteursLiens.id_objet == self.id) return SpipAuteursLiens.select().where(SpipAuteursLiens.id_objet == self.id)
def get_frontmatter(self): def get_frontmatter(self):
return "---\n{}---".format( return "---\n{}---".format(
yaml.dump( dump(
{ {
"lang": self.lang, "lang": self.lang,
"title": self.title, "title": self.title,
@ -78,3 +83,37 @@ class metadata:
if len(self.microblog) > 0 if len(self.microblog) > 0
else "" else ""
) )
def get_article(self):
    """Return the full text of the exported article.

    The result is the front matter, the starting block, the converted body
    (`self.text`) and the ending block, separated by single newlines.
    """
    frontmatter = self.get_frontmatter()
    starting = self.get_starting()
    body = self.text
    ending = self.get_ending()
    return f"{frontmatter}\n{starting}\n{body}\n{ending}"
class Articles:
    """Iterator over the SPIP articles to export, newest publication first.

    Each step yields a `(counter, article)` tuple where `counter` is a dict
    with the keys "exported" and "remaining" and `article` is an `Article`.
    """

    # Number of articles already handed out by __next__
    exported: int = 0

    def __init__(self, maxToExport) -> None:
        """Select at most `maxToExport` articles, sorted by publication date."""
        newestFirst = SpipArticles.select().order_by(SpipArticles.date.desc())
        self.articles = newestFirst.limit(maxToExport)
        self.toExport: int = len(self.articles)

    def remaining(self):
        """Return how many selected articles have not been handed out yet."""
        return self.toExport - self.exported

    def __iter__(self):
        return self

    def __next__(self):
        if self.remaining() <= 0:
            raise StopIteration
        self.exported += 1
        # The counters are computed after the increment, so "exported" is
        # 1-based and "remaining" excludes the article being returned.
        counter = {"exported": self.exported, "remaining": self.remaining()}
        return counter, Article(self.articles[self.exported - 1])

50
spip2md/main.py Executable file
View File

@ -0,0 +1,50 @@
#!python
"""Export SPIP articles to Markdown files with YAML front matter.

Usage: main.py [maxToExport]

The optional first command-line argument is the maximum number of articles to
export; without it, `config.defaultNbToExport` is used.
"""
import sys
from os import mkdir
from shutil import rmtree

from config import config
from database import db
from iterator import Articles

# Terminal escape sequences used to stylize the output
R = "\033[91m"
G = "\033[92m"
B = "\033[94m"
BOLD = "\033[1m"
UNDERLINE = "\033[4m"
RESET = "\033[0m"


def main() -> None:
    """Recreate the output directory and write one index.md per article."""
    # Clean the output dir & create a new one
    rmtree(config.outputDir, ignore_errors=True)
    mkdir(config.outputDir)

    # Connect to the MySQL database with Peewee ORM
    db.init(config.db, host=config.dbHost, user=config.dbUser, password=config.dbPass)
    db.connect()
    try:
        # Define max nb of articles to export based on first CLI param
        if len(sys.argv) > 1:
            maxToExport = int(sys.argv[1])
        else:
            maxToExport = config.defaultNbToExport

        # Loop among first maxToExport articles & export them
        for counter, article in Articles(maxToExport):
            # Print a header before the first article and then every 100
            if (counter["exported"] - 1) % 100 == 0:
                print(
                    f"\n{BOLD}Exporting {R}{counter['remaining']+1}{RESET}"
                    + f"{BOLD} SPIP articles to Markdown & YAML files{RESET}\n"
                )
            print(f"{BOLD}{counter['exported']}.{RESET} {article.title}")
            fullPath = config.outputDir + "/" + article.get_path()
            print(f"\t-> {fullPath}/index.md")
            mkdir(fullPath)
            # Explicit encoding: the output must not depend on the locale
            with open(fullPath + "/index.md", "w", encoding="utf-8") as f:
                f.write(article.get_article())
    finally:
        # Close the database connection even if the export fails midway
        db.close()


# Importing this module no longer terminates the importing process
if __name__ == "__main__":
    main()

View File

@ -1,65 +0,0 @@
#!python
import sys
from os import mkdir
from shutil import rmtree

from config import CONFIG
from convert import convert
from Metadata import metadata
from SpipDatabase import *

# Start from an empty output directory
rmtree(CONFIG["outputDir"], True)
mkdir(CONFIG["outputDir"])

# Open the MySQL connection through the Peewee ORM
db.connect()

# All articles, most recently published first
articles = SpipArticles.select().order_by(SpipArticles.date.desc())

# Number of articles to export: the first CLI argument when it lies strictly
# between 0 and the number of articles, else everything; without an argument,
# everything up to CONFIG["maxExportNb"].
available = len(articles)
if len(sys.argv) > 1:
    requested = int(sys.argv[1])
    nbToExport = requested if 0 < requested < available else available
else:
    nbToExport = min(available, CONFIG["maxExportNb"])

print(f"--- Export of {nbToExport} SPIP articles to Markdown & YAML files ---\n")

# Write each selected article to <outputDir>/<slug>/index.md
for exported in range(nbToExport):
    # Progress reminder every 10 articles, except before the first one
    if exported and not exported % 10:
        print(f"\n--- {nbToExport - exported} articles remaining ---\n")
    article = articles[exported]
    meta = metadata(article)
    print(f"{exported+1}. Exporting {meta.title}")
    slug = meta.get_slug()
    print(f" to {slug}/index.md")
    articleDir = f"{CONFIG['outputDir']}/{slug}"
    mkdir(articleDir)
    with open(f"{articleDir}/index.md", "w") as f:
        frontmatter = meta.get_frontmatter()
        starting = meta.get_starting()
        body = convert(article.texte)
        ending = meta.get_ending()
        f.write(f"{frontmatter}\n{starting}\n{body}\n{ending}")

# Close the database connection
db.close()

# Announce the end of the script
print(
    f"\n--- Exported {nbToExport} SPIP articles to ./{CONFIG['outputDir']}/*/index.md ---"
)