sections directories, better unknown char messages

This commit is contained in:
Guilhem Fauré 2023-05-16 13:29:59 +02:00
parent caf9db541f
commit f23073ef12
3 changed files with 60 additions and 38 deletions

View File

@ -13,7 +13,7 @@ class Article:
# self.surtitle = article.surtitre # Probably unused # self.surtitle = article.surtitre # Probably unused
self.title = convertMeta(article.titre) self.title = convertMeta(article.titre)
self.subtitle = article.soustitre # Probably unused self.subtitle = article.soustitre # Probably unused
# self.section = article.id_rubrique # TODO join self.section_id = article.id_rubrique
self.description = convertMeta(article.descriptif) self.description = convertMeta(article.descriptif)
self.caption = article.chapo # Probably unused self.caption = article.chapo # Probably unused
self.text = convertBody(article.texte) # Markdown self.text = convertBody(article.texte) # Markdown
@ -38,14 +38,30 @@ class Article:
self.virtual = article.virtuel # TODO Why? self.virtual = article.virtuel # TODO Why?
self.microblog = article.microblog # Probably unused self.microblog = article.microblog # Probably unused
def getSlug(self): def getSection(self):
return slugify(f"{self.id}-{self.title}") return convertMeta(
SpipRubriques.select()
.where(SpipRubriques.id_rubrique == self.section_id)[0]
.titre
)
def getPath(self): def getPath(self) -> str:
return self.getSlug() return (
slugify(self.getSection()) + "/" + slugify(f"{self.id}-{self.title}") + "/"
)
def getFilename(self) -> str:
return "index.fr.md"
def getAuthors(self): def getAuthors(self):
return SpipAuteursLiens.select().where(SpipAuteursLiens.id_objet == self.id) return (
SpipAuteurs.select()
.join(
SpipAuteursLiens,
on=(SpipAuteurs.id_auteur == SpipAuteursLiens.id_auteur),
)
.where(SpipAuteursLiens.id_objet == self.id)
)
def getFrontmatter(self): def getFrontmatter(self):
return dump( return dump(
@ -58,7 +74,7 @@ class Article:
"lastmod": self.update, "lastmod": self.update,
"draft": self.draft, "draft": self.draft,
"description": self.description, "description": self.description,
"authors": [author.id_auteur for author in self.getAuthors()], "authors": [author.nom for author in self.getAuthors()],
}, },
allow_unicode=True, allow_unicode=True,
) )
@ -83,7 +99,7 @@ class Article:
article += "\n\n# MICROBLOGGING\n\n" + self.microblog article += "\n\n# MICROBLOGGING\n\n" + self.microblog
return article return article
def getUnknownChars(self): def getUnknownChars(self) -> list:
errors: list = [] errors: list = []
for text in (self.title, self.text): for text in (self.title, self.text):
for char in unknownIso: for char in unknownIso:

View File

@ -1,7 +1,7 @@
from re import I, S, compile, finditer from re import I, S, compile, finditer
# SPIP syntax to Markdown # SPIP syntax to Markdown
spipToMarkdown = ( spipToMarkdown: tuple = (
( # horizontal rule ( # horizontal rule
compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I), compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I),
# r"---", # r"---",
@ -113,7 +113,7 @@ spipToMarkdown = (
), ),
) )
spipToText = ( spipToText: tuple = (
( # strong ( # strong
compile(r"\{\{ *(.*?) *\}\}", S | I), compile(r"\{\{ *(.*?) *\}\}", S | I),
r"\1", r"\1",
@ -158,7 +158,7 @@ spipToText = (
), ),
) )
isoToUtf = ( isoToUtf: tuple = (
# Broken encoding # Broken encoding
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1 ( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
"’", "’",
@ -252,14 +252,13 @@ isoToUtf = (
) )
## WARNING unknown broken encoding ## WARNING unknown broken encoding
unknownIso = ( unknownIso: tuple = (
r"
", # unknown 
 + surroundings r"
", # unknown 

r"∆", # unknown â^† + surroundings r"∆", # unknown â^†
) )
def convertBody(spipBody): def convertBody(text: str) -> str:
text: str = spipBody
for spip, markdown in spipToMarkdown: for spip, markdown in spipToMarkdown:
text = spip.sub(markdown, text) text = spip.sub(markdown, text)
for iso, utf in isoToUtf: for iso, utf in isoToUtf:
@ -267,18 +266,22 @@ def convertBody(spipBody):
return text return text
def convertMeta(spipMeta): def convertMeta(text: str) -> str:
text: str = spipMeta
for spip, metadata in spipToText: for spip, metadata in spipToText:
text = spip.sub(metadata, text) text = spip.sub(metadata, text)
for iso, utf in isoToUtf: for iso, utf in isoToUtf:
text.replace(iso, utf) text.replace(iso, utf)
return text return text
def highlightUnknownChars(text): def removeUnknownChars(text: str) -> str:
for char in unknownIso:
text.replace(char, "")
return text
def highlightUnknownChars(text: str) -> str:
# Define terminal escape sequences to stylize output, regex escaped # Define terminal escape sequences to stylize output, regex escaped
COLOR = "\033[91m" + "\033[1m" # Red + Bold COLOR: str = "\033[91m" + "\033[1m" # Red + Bold
RESET = "\033[0m" RESET: str = "\033[0m"
# Highlight in COLOR unknown chars in text # Highlight in COLOR unknown chars in text
for char in unknownIso: for char in unknownIso:
for match in finditer(char, text): for match in finditer(char, text):

View File

@ -1,14 +1,14 @@
#!python #!python
from articles import Article, Articles
from config import config from config import config
from database import db
from articles import Articles
from converter import highlightUnknownChars from converter import highlightUnknownChars
from database import db
if __name__ != "__main__": if __name__ != "__main__":
exit() exit()
import sys import sys
from os import mkdir from os import makedirs, mkdir
from shutil import rmtree from shutil import rmtree
# Clean the output dir & create a new one # Clean the output dir & create a new one
@ -32,7 +32,8 @@ B: str = "\033[94m"
BOLD: str = "\033[1m" BOLD: str = "\033[1m"
RESET: str = "\033[0m" RESET: str = "\033[0m"
unknownChars: dict = {} # Articles that contain unknown chars
unknownCharsArticles: list[Article] = []
# Loop over the first maxToExport articles & export them # Loop over the first maxToExport articles & export them
for counter, article in Articles(maxToExport): for counter, article in Articles(maxToExport):
@ -44,24 +45,26 @@ for counter, article in Articles(maxToExport):
print( print(
f"{BOLD}{counter['exported']}.{RESET} " + highlightUnknownChars(article.title) f"{BOLD}{counter['exported']}.{RESET} " + highlightUnknownChars(article.title)
) )
fullPath = config.outputDir + "/" + article.getPath() fullPath: str = config.outputDir + "/" + article.getPath()
print(f"{BOLD}>{RESET} {fullPath}/index.md") print(f"{BOLD}>{RESET} {fullPath}{article.getFilename()}")
mkdir(fullPath) makedirs(fullPath, exist_ok=True)
with open(fullPath + "/index.md", "w") as f: with open(fullPath + article.getFilename(), "w") as f:
f.write(article.getArticle()) f.write(article.getArticle())
# Store detected unknown characters # Store detected unknown characters
if len(article.getUnknownChars()) > 0: if len(article.getUnknownChars()) > 0:
unknownChars[article.title] = article.getUnknownChars() unknownCharsArticles.append(article)
for title in unknownChars: for article in unknownCharsArticles:
nb = len(unknownChars[title]) unknownCharsApparitions: list = article.getUnknownChars()
nb: int = len(unknownCharsApparitions)
s: str = "s" if nb > 1 else ""
print( print(
f"\n{BOLD}{nb} " f"\n{BOLD}{nb}{RESET} unknown character{s} "
+ f"unknown character{'s' if nb > 1 else ''} detected in{RESET} " + + f"detected in article {BOLD}{article.id}{RESET}"
highlightUnknownChars(title) + f"\n{BOLD}·{RESET} "
+ highlightUnknownChars(article.title)
) )
for text in unknownChars[title]: for text in unknownCharsApparitions:
print(f" {BOLD}{RESET} " + highlightUnknownChars(text)) print(f" {BOLD}{RESET} " + highlightUnknownChars(text))
# Close the database connection db.close() # Close the database connection
db.close()