sections directories, better unknown char messages

This commit is contained in:
Guilhem Fauré 2023-05-16 13:29:59 +02:00
parent caf9db541f
commit f23073ef12
3 changed files with 60 additions and 38 deletions

View File

@ -13,7 +13,7 @@ class Article:
# self.surtitle = article.surtitre # Probably unused
self.title = convertMeta(article.titre)
self.subtitle = article.soustitre # Probably unused
# self.section = article.id_rubrique # TODO join
self.section_id = article.id_rubrique
self.description = convertMeta(article.descriptif)
self.caption = article.chapo # Probably unused
self.text = convertBody(article.texte) # Markdown
@ -38,14 +38,30 @@ class Article:
self.virtual = article.virtuel # TODO Why?
self.microblog = article.microblog # Probably unused
def getSlug(self):
    """Return the slugified "<id>-<title>" identifier of this article."""
    # Prefixing with the id keeps the slug distinct for articles sharing a title
    identifier = f"{self.id}-{self.title}"
    return slugify(identifier)
def getSection(self):
    """Return the title of this article's section, cleaned by convertMeta.

    Selects the SpipRubriques rows whose id_rubrique equals self.section_id
    and uses the first one.
    """
    matching = SpipRubriques.select().where(
        SpipRubriques.id_rubrique == self.section_id
    )
    # Only the first matching row is used
    section = matching[0]
    return convertMeta(section.titre)
def getPath(self) -> str:
    """Return the article directory, relative to the output directory.

    The path has the form "<section-slug>/<id>-<title-slug>/". The trailing
    slash is intentional: the export script appends getFilename() directly
    to this path.
    """
    section_dir = slugify(self.getSection())
    article_dir = slugify(f"{self.id}-{self.title}")
    return section_dir + "/" + article_dir + "/"
def getFilename(self) -> str:
    """Return the name of the file the exported article is written to."""
    filename: str = "index.fr.md"
    return filename
def getAuthors(self):
    """Return a query of the SpipAuteurs rows linked to this article.

    SpipAuteurs is joined with SpipAuteursLiens on id_auteur, keeping the
    links whose id_objet equals self.id. Author rows (not link rows) are
    selected because the frontmatter reads each author's `nom`.
    """
    # NOTE(review): the filter only checks id_objet; if the link table also
    # stores links to other object types, ids could collide - confirm whether
    # an object-type condition is needed.
    return (
        SpipAuteurs.select()
        .join(
            SpipAuteursLiens,
            on=(SpipAuteurs.id_auteur == SpipAuteursLiens.id_auteur),
        )
        .where(SpipAuteursLiens.id_objet == self.id)
    )
def getFrontmatter(self):
return dump(
@ -58,7 +74,7 @@ class Article:
"lastmod": self.update,
"draft": self.draft,
"description": self.description,
"authors": [author.id_auteur for author in self.getAuthors()],
"authors": [author.nom for author in self.getAuthors()],
},
allow_unicode=True,
)
@ -83,7 +99,7 @@ class Article:
article += "\n\n# MICROBLOGGING\n\n" + self.microblog
return article
def getUnknownChars(self):
def getUnknownChars(self) -> list:
errors: list = []
for text in (self.title, self.text):
for char in unknownIso:

View File

@ -1,7 +1,7 @@
from re import I, S, compile, finditer
# SPIP syntax to Markdown
spipToMarkdown = (
spipToMarkdown: tuple = (
( # horizontal rule
compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I),
# r"---",
@ -113,7 +113,7 @@ spipToMarkdown = (
),
)
spipToText = (
spipToText: tuple = (
( # strong
compile(r"\{\{ *(.*?) *\}\}", S | I),
r"\1",
@ -158,7 +158,7 @@ spipToText = (
),
)
isoToUtf = (
isoToUtf: tuple = (
# Broken encoding
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
"’",
@ -252,14 +252,13 @@ isoToUtf = (
)
## WARNING unknown broken encoding
unknownIso = (
r"
", # unknown 
 + surroundings
r"∆", # unknown â^† + surroundings
unknownIso: tuple = (
r"
", # unknown 

r"∆", # unknown â^†
)
def convertBody(spipBody):
text: str = spipBody
def convertBody(text: str) -> str:
for spip, markdown in spipToMarkdown:
text = spip.sub(markdown, text)
for iso, utf in isoToUtf:
@ -267,18 +266,22 @@ def convertBody(spipBody):
return text
def convertMeta(text: str) -> str:
    """Convert a SPIP metadata string to plain text.

    Applies every (pattern, replacement) pair of spipToText with the
    pattern's sub(), then replaces every broken-encoding sequence listed in
    isoToUtf with its intended character.

    Returns the converted string.
    """
    for spip, metadata in spipToText:
        text = spip.sub(metadata, text)
    for iso, utf in isoToUtf:
        # str.replace returns a new string; rebind it or the fix is lost
        text = text.replace(iso, utf)
    return text
def highlightUnknownChars(text):
def removeUnknownChars(text: str) -> str:
    """Return text with every sequence listed in unknownIso removed."""
    for char in unknownIso:
        # str.replace does not modify text in place; keep the returned copy
        text = text.replace(char, "")
    return text
def highlightUnknownChars(text: str) -> str:
# Define terminal escape sequences to stylize output, regex escaped
COLOR = "\033[91m" + "\033[1m" # Red + Bold
RESET = "\033[0m"
COLOR: str = "\033[91m" + "\033[1m" # Red + Bold
RESET: str = "\033[0m"
# Highlight in COLOR unknown chars in text
for char in unknownIso:
for match in finditer(char, text):

View File

@ -1,14 +1,14 @@
#!python
from articles import Article, Articles
from config import config
from database import db
from articles import Articles
from converter import highlightUnknownChars
from database import db
if __name__ != "__main__":
exit()
import sys
from os import mkdir
from os import makedirs, mkdir
from shutil import rmtree
# Clean the output dir & create a new
@ -32,7 +32,8 @@ B: str = "\033[94m"
BOLD: str = "\033[1m"
RESET: str = "\033[0m"
unknownChars: dict = {}
# Articles that contain unknown chars
unknownCharsArticles: list[Article] = []
# Loop among first maxToExport articles & export them
for counter, article in Articles(maxToExport):
@ -44,24 +45,26 @@ for counter, article in Articles(maxToExport):
print(
f"{BOLD}{counter['exported']}.{RESET} " + highlightUnknownChars(article.title)
)
fullPath = config.outputDir + "/" + article.getPath()
print(f"{BOLD}>{RESET} {fullPath}/index.md")
mkdir(fullPath)
with open(fullPath + "/index.md", "w") as f:
fullPath: str = config.outputDir + "/" + article.getPath()
print(f"{BOLD}>{RESET} {fullPath}{article.getFilename()}")
makedirs(fullPath, exist_ok=True)
with open(fullPath + article.getFilename(), "w") as f:
f.write(article.getArticle())
# Store detected unknown characters
if len(article.getUnknownChars()) > 0:
unknownChars[article.title] = article.getUnknownChars()
unknownCharsArticles.append(article)
for title in unknownChars:
nb = len(unknownChars[title])
for article in unknownCharsArticles:
unknownCharsApparitions: list = article.getUnknownChars()
nb: int = len(unknownCharsApparitions)
s: str = "s" if nb > 1 else ""
print(
f"\n{BOLD}{nb} "
+ f"unknown character{'s' if nb > 1 else ''} detected in{RESET} " +
highlightUnknownChars(title)
f"\n{BOLD}{nb}{RESET} unknown character{s} "
+ f"detected in article {BOLD}{article.id}{RESET}"
+ f"\n{BOLD}·{RESET} "
+ highlightUnknownChars(article.title)
)
for text in unknownChars[title]:
for text in unknownCharsApparitions:
print(f" {BOLD}{RESET} " + highlightUnknownChars(text))
# Close the database connection
db.close()
db.close() # Close the database connection