refactor project structure

This commit is contained in:
Guilhem Fauré 2023-05-15 17:10:58 +02:00
parent 8eb0d1101a
commit b8f99fb329
6 changed files with 208 additions and 167 deletions

View File

@ -1,7 +1,30 @@
# Static export settings: database name and credentials, output directory,
# and the maximum number of articles exported when no count is requested.
CONFIG = dict(
    db="spip",
    dbUser="spip",
    dbPass="password",
    outputDir="output",
    maxExportNb=1000,
)
from yaml import CLoader as Loader
from yaml import load
class Configuration:
    """Export settings, optionally overridden from a YAML file.

    The class attributes below are the defaults. When ``configFile`` is
    given, every recognised top-level key found in that YAML document
    replaces the corresponding default on the instance.
    """

    # Name of the database to open
    db = "spip"
    # Host of the database server
    dbHost = "localhost"
    # Database credentials
    dbUser = "spip"
    dbPass = "password"
    # Directory the export is written into
    outputDir = "output"
    # Number of articles exported when no explicit count is requested
    defaultNbToExport = 1000

    def __init__(self, configFile=None) -> None:
        """Load overrides from ``configFile`` when one is provided.

        Args:
            configFile: Path of a YAML file holding a mapping of setting
                names to values, or ``None`` to keep every default.
        """
        if configFile is None:
            return
        with open(configFile, encoding="utf-8") as f:
            # NOTE(review): this is PyYAML's full loader, which can build
            # arbitrary Python objects; only point it at trusted files, or
            # switch to the safe loader.
            config = load(f.read(), Loader=Loader)
        # An empty YAML document loads as None: nothing to override.
        if config is None:
            return
        # "dbHost" is included so that the host can be configured like the
        # other connection settings.
        for key in (
            "db",
            "dbHost",
            "dbUser",
            "dbPass",
            "outputDir",
            "defaultNbToExport",
        ):
            if key in config:
                setattr(self, key, config[key])
# Shared settings object used by the rest of the program. It is built with
# the defaults only; the commented-out line shows how to load overrides from
# a YAML file instead.
# config = Configuration("spip2md.yml")
config = Configuration()

View File

@ -1,159 +1,159 @@
import re
from re import I, S, compile
# SPIP syntax to Markdown: (compiled pattern, replacement) pairs, applied one after another in the order listed
spipToMarkdown = (
( # horizontal rule
re.compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", re.S | re.I),
compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I),
# r"---",
r"***",
),
( # line break
re.compile(r"\r?\n_ *(?=\r?\n)|<br ?.*?>", re.S | re.I),
compile(r"\r?\n_ *(?=\r?\n)|<br ?.*?>", S | I),
"\n",
),
( # heading
re.compile(r"\{\{\{ *(.*?) *\}\}\}", re.S | re.I),
compile(r"\{\{\{ *(.*?) *\}\}\}", S | I),
r"# \1",
# r"## \1",
),
( # strong
re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I),
compile(r"\{\{ *(.*?) *\}\}", S | I),
r"**\1**",
),
( # html strong
re.compile(r"<strong> *(.*?) *</strong>", re.S | re.I),
compile(r"<strong> *(.*?) *</strong>", S | I),
r"**\1**",
),
( # emphasis
re.compile(r"\{ *(.*?) *\}", re.S | re.I),
compile(r"\{ *(.*?) *\}", S | I),
r"*\1*",
),
( # html emphasis
re.compile(r"<i> *(.*?) *<\/i>", re.S | re.I),
compile(r"<i> *(.*?) *<\/i>", S | I),
r"*\1*",
),
( # strikethrough
re.compile(
compile(
r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
re.S | re.I,
S | I,
),
r"~\1~",
),
( # anchor
re.compile(r"\[ *(.*?) *-> *(.*?) *\]", re.S | re.I),
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
r"[\1](\2)",
),
( # image
re.compile(r"<(?:img|image)(.*?)(\|.*?)*>", re.S | re.I),
compile(r"<(?:img|image)(.*?)(\|.*?)*>", S | I),
r"![image](\1)",
),
( # document anchor
re.compile(r"<(?:doc|emb)(.*?)(\|.*?)*>", re.S | re.I),
compile(r"<(?:doc|emb)(.*?)(\|.*?)*>", S | I),
r"[document](\1)",
),
( # wikilink
re.compile(r"\[\? *(.*?) *\]", re.S | re.I),
compile(r"\[\? *(.*?) *\]", S | I),
r"[\1](https://wikipedia.org/wiki/\1)",
),
( # footnote
re.compile(r"\[\[ *(.*?) *\]\]", re.S | re.I),
compile(r"\[\[ *(.*?) *\]\]", S | I),
r"",
),
( # unordered list
re.compile(r"(\r?\n)-(?!#|-)\*? *", re.S | re.I),
compile(r"(\r?\n)-(?!#|-)\*? *", S | I),
r"\1- ",
),
( # wrong unordered list
re.compile(r"(\r?\n)\* +", re.S | re.I),
compile(r"(\r?\n)\* +", S | I),
r"\1- ",
),
( # wrong unordered list WARNING suppresses preceding tag
re.compile(r"(\r?\n)<.*?>\* +", re.I),
compile(r"(\r?\n)<.*?>\* +", I),
r"\1- ",
),
( # ordered-list
re.compile(r"(\r?\n)-# *", re.S | re.I),
compile(r"(\r?\n)-# *", S | I),
r"\g<1>1. ",
),
( # table-metadata
re.compile(r"(\r?\n)\|\|(.*?)\|(.*?)\|\|", re.S | re.I),
compile(r"(\r?\n)\|\|(.*?)\|(.*?)\|\|", S | I),
r"",
),
( # quote
re.compile(
compile(
r"<(?:quote|poesie)>\s*(.*?)\s*(?:(\r?\n){2,}|<\/(?:quote|poesie)>)",
re.S | re.I,
S | I,
),
r"> \1\2\2",
),
( # box
re.compile(
compile(
r"<code>\s*(.*?)\s*(?:(?:\r?\n){2,}|<\/code>)",
re.S | re.I,
S | I,
),
"`\\1`",
),
( # fence
re.compile(
compile(
r"<cadre>\s*(.*?)\s*(?:(?:\r?\n){2,}|<\/cadre>)",
re.S | re.I,
S | I,
),
"```\n\\1\n\n```",
),
( # Keep only the first language in multi-language blocks
re.compile(
compile(
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
re.S | re.I,
S | I,
),
r"\1",
),
( # WARNING remove every html tag
re.compile(r"<\/?.*?> *", re.S | re.I),
compile(r"<\/?.*?> *", S | I),
r"",
),
)
spipToMetadata = (
spipToText = (
( # strong
re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I),
compile(r"\{\{ *(.*?) *\}\}", S | I),
r"\1",
),
( # html strong
re.compile(r"<strong> *(.*?) *</strong>", re.S | re.I),
compile(r"<strong> *(.*?) *</strong>", S | I),
r"\1",
),
( # emphasis
re.compile(r"\{ *(.*?) *\}", re.S | re.I),
compile(r"\{ *(.*?) *\}", S | I),
r"\1",
),
( # html emphasis
re.compile(r"<i> *(.*?) *<\/i>", re.S | re.I),
compile(r"<i> *(.*?) *<\/i>", S | I),
r"\1",
),
( # strikethrough
re.compile(
compile(
r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
re.S | re.I,
S | I,
),
r"\1",
),
( # Keep only the first language in multi-language blocks
re.compile(
compile(
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
re.S | re.I,
S | I,
),
r"\1",
),
( # remove every html tag
re.compile(r"<\/?.*?> *", re.S | re.I),
compile(r"<\/?.*?> *", S | I),
r"",
),
( # beginning with angle bracket(s)
re.compile(r"^>+ +", re.S | re.I),
compile(r"^>+ +", S | I),
r"",
),
( # beginning with a number followed by a dot
re.compile(r"^\d+\. +", re.S | re.I),
compile(r"^\d+\. +", S | I),
r"",
),
)
@ -161,117 +161,119 @@ spipToMetadata = (
isoToUtf = (
# Broken encoding
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
re.compile("’"),
compile("’"),
r"",
),
( # Fix UTF-8 ‘ that was interpreted as ISO 8859-1
re.compile("‘"),
compile("‘"),
r"",
),
( # Fix UTF-8 é that was interpreted as ISO 8859-1
re.compile("\u0081"),
compile("\u0081"),
r"é",
),
( # Fix UTF-8 è that was interpreted as ISO 8859-1
re.compile("è"),
compile("è"),
r"è",
),
( # Fix UTF-8 ê that was interpreted as ISO 8859-1
re.compile(""),
compile(""),
r"ê",
),
( # Fix UTF-8 ô that was interpreted as ISO 8859-1
re.compile(""),
compile(""),
r"ô",
),
( # Fix UTF-8 î that was interpreted as ISO 8859-1
re.compile(""),
compile(""),
r"î",
),
( # Fix UTF-8 ï that was interpreted as ISO 8859-1
re.compile("ˆ"),
compile("ˆ"),
r"ï",
),
( # Fix UTF-8 ö that was interpreted as ISO 8859-1
re.compile("ˆ"),
compile("ˆ"),
r"ö",
),
( # Fix UTF-8 ü that was interpreted as ISO 8859-1
re.compile("ˆ"),
compile("ˆ"),
r"ü",
),
( # Fix UTF-8 à that was interpreted as ISO 8859-1
re.compile("à"),
compile("à"),
r"à",
),
( # Fix UTF-8 … that was interpreted as ISO 8859-1
re.compile("…"),
compile("…"),
r"",
),
( # Fix UTF-8 “ that was interpreted as ISO 8859-1
re.compile("“"),
compile("“"),
r"",
),
( # Fix UTF-8 ” that was interpreted as ISO 8859-1
re.compile("â€\u009d"),
compile("â€\u009d"),
r"",
),
( # Fix UTF-8 that was interpreted as ISO 8859-1
re.compile("–"),
compile("–"),
r"",
),
( # Fix UTF-8 that was interpreted as ISO 8859-1
re.compile("—"),
compile("—"),
r"",
),
( # Fix UTF-8 that was interpreted as ISO 8859-1
re.compile("â€\u0090"),
compile("â€\u0090"),
r"",
),
( # Fix UTF-8 • that was interpreted as ISO 8859-1
re.compile("•"),
compile("•"),
r"",
),
( # Fix UTF-8 ç that was interpreted as ISO 8859-1
re.compile("ç"),
compile("ç"),
r"ç",
),
( # Fix UTF-8 í that was interpreted as ISO 8859-1
re.compile("\u0081"),
compile("\u0081"),
r"í",
),
# WARNING not sure
( # Fix UTF-8 é that was interpreted as ISO 8859-1
re.compile(""),
compile(""),
r"é",
),
( # Fix UTF-8 † that was interpreted as ISO 8859-1
re.compile("†"),
compile("†"),
r"",
),
)
## WARNING unknown broken encoding
unknownIso = (re.compile(r"\w*
.*\r?\n"),) # unknown 
 + surroundings
unknownIso = (compile(r"\w*
.*\r?\n"),) # unknown 
 + surroundings
def convert(markup):
def convertBody(spipBody):
text = spipBody
for spip, markdown in spipToMarkdown:
markup = spip.sub(markdown, markup)
text = spip.sub(markdown, text)
for iso, utf in isoToUtf:
markup = iso.sub(utf, markup)
text = iso.sub(utf, text)
for iso in unknownIso:
for match in iso.finditer(markup):
for match in iso.finditer(text):
print(f" UNKNOWN CHARACTER {match.group()}")
return markup
return text
def convertMeta(markup):
for spip, metadata in spipToMetadata:
markup = spip.sub(metadata, markup)
def convertMeta(spipMeta):
text = spipMeta
for spip, metadata in spipToText:
text = spip.sub(metadata, text)
for iso, utf in isoToUtf:
markup = iso.sub(utf, markup)
text = iso.sub(utf, text)
for iso in unknownIso:
for match in iso.finditer(markup):
for match in iso.finditer(text):
print(f" UNKNOWN CHARACTER {match.group()}")
return markup
return text

View File

@ -1,21 +1,13 @@
from config import CONFIG
from peewee import *
from peewee import (SQL, BigAutoField, BigIntegerField, CharField,
CompositeKey, DateField, DateTimeField, FloatField,
IntegerField, Model, MySQLDatabase, TextField)
db = MySQLDatabase(
CONFIG["db"],
**{
"charset": "utf8",
"sql_mode": "PIPES_AS_CONCAT",
"use_unicode": True,
"user": CONFIG["dbUser"],
"password": CONFIG["dbPass"],
}
)
# class UnknownField(object):
# def __init__(self, *_, **__):
# pass
class UnknownField:
    """Placeholder type whose constructor accepts and ignores any arguments.

    NOTE(review): presumably stands in for column types that have no
    matching peewee field; confirm against the model definitions.
    """

    def __init__(self, *_, **__) -> None:
        return None
# Database handle created without a database name: the name, host and
# credentials are supplied later through db.init(...) by the entry-point
# script, before it connects.
db = MySQLDatabase(None)
class BaseModel(Model):

View File

@ -1,10 +1,11 @@
import yaml
from convert import convertMeta
from converter import convertBody, convertMeta
from database import *
from slugify import slugify
from SpipDatabase import *
# from yaml import CDumper as Dumper
from yaml import dump
class metadata:
class Article:
def __init__(self, article):
self.id = article.id_article
# self.surtitle = article.surtitre # Probably unused
@ -13,6 +14,7 @@ class metadata:
# self.section = article.id_rubrique # TODO join
self.description = convertMeta(article.descriptif)
self.caption = article.chapo # Probably unused
self.text = convertBody(article.texte) # Markdown
self.ps = article.ps # Probably unused
self.publicationDate = article.date
self.draft = False if article.statut == "publie" else True
@ -36,13 +38,16 @@ class metadata:
def get_slug(self):
return slugify(f"{self.id}-{self.title}")
def get_path(self):
return self.get_slug()
def get_authors(self):
return SpipAuteursLiens.select().where(SpipAuteursLiens.id_objet == self.id)
def get_frontmatter(self):
return "---\n{}---".format(
yaml.dump(
dump(
{
"lang": self.lang,
"title": self.title,
@ -78,3 +83,37 @@ class metadata:
if len(self.microblog) > 0
else ""
)
def get_article(self):
    """Return the full exported document as a single string.

    The front matter, the starting block, the converted body text and the
    ending block are concatenated in that order, each separated by a
    newline.
    """
    frontmatter = self.get_frontmatter()
    starting = self.get_starting()
    body = self.text
    ending = self.get_ending()
    return f"{frontmatter}\n{starting}\n{body}\n{ending}"
class Articles:
    """Single-pass iterator over the articles selected for export.

    Each step yields a pair: a progress dict with the keys ``"exported"``
    and ``"remaining"``, and the matching row wrapped in ``Article``.
    """

    # Number of articles handed out so far
    exported: int = 0

    def __init__(self, maxToExport) -> None:
        """Select at most ``maxToExport`` articles, newest publication first."""
        newest_first = SpipArticles.select().order_by(SpipArticles.date.desc())
        self.articles = newest_first.limit(maxToExport)
        # Size of the selection, which may be smaller than maxToExport
        self.toExport: int = len(self.articles)

    def remaining(self):
        """Return how many selected articles have not been handed out yet."""
        return self.toExport - self.exported

    def __iter__(self):
        return self

    def __next__(self):
        if self.remaining() <= 0:
            raise StopIteration
        # Index of the article to hand out, captured before the counter moves
        position = self.exported
        self.exported = position + 1
        progress = {"exported": self.exported, "remaining": self.remaining()}
        return progress, Article(self.articles[position])

50
spip2md/main.py Executable file
View File

@ -0,0 +1,50 @@
#!python
"""Export SPIP articles to Markdown files with YAML front matter.

Run as a script. The optional first command-line argument is the maximum
number of articles to export; without it the configured default is used.
"""
import sys
from os import mkdir
from shutil import rmtree

from config import config
from database import db
from iterator import Articles

# Terminal escape sequences used to stylize the output
R = "\033[91m"
G = "\033[92m"
B = "\033[94m"
BOLD = "\033[1m"
UNDERLINE = "\033[4m"
RESET = "\033[0m"


def main() -> None:
    """Recreate the output directory and export the selected articles."""
    # Clean the output dir & create a new one
    rmtree(config.outputDir, True)
    mkdir(config.outputDir)

    # Connect to the MySQL database with Peewee ORM
    db.init(config.db, host=config.dbHost, user=config.dbUser, password=config.dbPass)
    db.connect()
    try:
        # Define max nb of articles to export based on first CLI param
        if len(sys.argv) > 1:
            maxToExport = int(sys.argv[1])
        else:
            maxToExport = config.defaultNbToExport

        # Loop among first maxToExport articles & export them
        for counter, article in Articles(maxToExport):
            # Print a header before the first article and then every 100
            if (counter["exported"] - 1) % 100 == 0:
                print(
                    f"\n{BOLD}Exporting {R}{counter['remaining']+1}{RESET}"
                    + f"{BOLD} SPIP articles to Markdown & YAML files{RESET}\n"
                )
            print(f"{BOLD}{counter['exported']}.{RESET} {article.title}")
            fullPath = config.outputDir + "/" + article.get_path()
            print(f"\t-> {fullPath}/index.md")
            mkdir(fullPath)
            # Explicit encoding so the output does not depend on the
            # platform's default locale
            with open(fullPath + "/index.md", "w", encoding="utf-8") as f:
                f.write(article.get_article())
    finally:
        # Close the database connection even when an export step fails
        db.close()


# Importing this module must not terminate the importing process, so the
# export only runs when the file is executed directly.
if __name__ == "__main__":
    main()

View File

@ -1,65 +0,0 @@
#!python
import sys
from os import mkdir
from shutil import rmtree
from config import CONFIG
from convert import convert
from Metadata import metadata
from SpipDatabase import *
# Start from a fresh, empty output directory
rmtree(CONFIG["outputDir"], True)
mkdir(CONFIG["outputDir"])

# Open the MySQL connection through the Peewee ORM
db.connect()

# Every article, most recently published first
articles = SpipArticles.select().order_by(SpipArticles.date.desc())
# Alternative ordering, by modification date:
# articles = SpipArticles.select().order_by(SpipArticles.date_modif.desc())

# The first CLI parameter, when it is a positive number below the article
# count, sets how many articles to export; any other value exports them all.
# Without a parameter the count is capped by the configured maximum.
if len(sys.argv) > 1:
    requested = int(sys.argv[1])
    available = len(articles)
    nbToExport = requested if 0 < requested < available else available
else:
    available = len(articles)
    nbToExport = min(available, CONFIG["maxExportNb"])

print(f"--- Export of {nbToExport} SPIP articles to Markdown & YAML files ---\n")

# Write each selected article to <outputDir>/<slug>/index.md
for exported in range(nbToExport):
    # Progress reminder every tenth article, except before the first one
    if exported and not exported % 10:
        print(f"\n--- {nbToExport - exported} articles remaining ---\n")
    article = articles[exported]
    meta = metadata(article)
    print(f"{exported+1}. Exporting {meta.title}")
    slug = meta.get_slug()
    print(f" to {slug}/index.md")
    articleDir = f"{CONFIG['outputDir']}/{slug}"
    mkdir(articleDir)
    with open(f"{articleDir}/index.md", "w") as f:
        f.write(
            f"{meta.get_frontmatter()}\n"
            f"{meta.get_starting()}\n"
            f"{convert(article.texte)}\n"
            f"{meta.get_ending()}"
        )

# Close the database connection
db.close()

# Announce the end of the script
print(
    f"\n--- Exported {nbToExport} SPIP articles to ./{CONFIG['outputDir']}/*/index.md ---"
)