unknown characters highlighting & reporting

This commit is contained in:
Guilhem Fauré 2023-05-16 11:19:47 +02:00
parent 12db0375e7
commit b61853a4d5
3 changed files with 80 additions and 41 deletions

View File

@ -253,32 +253,30 @@ isoToUtf = (
## WARNING unknown broken encoding ## WARNING unknown broken encoding
unknownIso = ( unknownIso = (
compile(r"\w*
.*\r?\n"), # unknown 
 + surroundings ( # unknown 
 + surroundings
compile(r"\w*∆.*\r?\n"), # unknown â^† + surroundings compile(r"
"),
compile(r"
.*(?=\r?\n|$)"),
),
( # unknown â^† + surroundings
compile(r"∆"),
compile(r"∆.*(?=\r?\n|$)"),
),
) )
def convertBody(spipBody): def convertBody(spipBody):
text = spipBody text: str = spipBody
errors = []
for spip, markdown in spipToMarkdown: for spip, markdown in spipToMarkdown:
text = spip.sub(markdown, text) text = spip.sub(markdown, text)
for iso, utf in isoToUtf: for iso, utf in isoToUtf:
text = iso.sub(utf, text) text = iso.sub(utf, text)
for iso in unknownIso: return text
for match in iso.finditer(text):
errors.append(match.group())
return text, errors
def convertMeta(spipMeta): def convertMeta(spipMeta):
text = spipMeta text: str = spipMeta
errors = []
for spip, metadata in spipToText: for spip, metadata in spipToText:
text = spip.sub(metadata, text) text = spip.sub(metadata, text)
for iso, utf in isoToUtf: for iso, utf in isoToUtf:
text = iso.sub(utf, text) text = iso.sub(utf, text)
for iso in unknownIso: return text
for match in iso.finditer(text):
errors.append(match.group())
return text, errors

View File

@ -1,6 +1,6 @@
from array import array from re import escape
from converter import convertBody, convertMeta from converter import convertBody, convertMeta, unknownIso
from database import * from database import *
from slugify import slugify from slugify import slugify
# from yaml import CDumper as Dumper # from yaml import CDumper as Dumper
@ -11,12 +11,12 @@ class Article:
def __init__(self, article): def __init__(self, article):
self.id = article.id_article self.id = article.id_article
# self.surtitle = article.surtitre # Probably unused # self.surtitle = article.surtitre # Probably unused
self.title, self.title_unknown = convertMeta(article.titre) self.title = convertMeta(article.titre)
self.subtitle = article.soustitre # Probably unused self.subtitle = article.soustitre # Probably unused
# self.section = article.id_rubrique # TODO join # self.section = article.id_rubrique # TODO join
self.description, self.description_unknown = convertMeta(article.descriptif) self.description = convertMeta(article.descriptif)
self.caption = article.chapo # Probably unused self.caption = article.chapo # Probably unused
self.text, self.text_unknown = convertBody(article.texte) # Markdown self.text = convertBody(article.texte) # Markdown
self.ps = article.ps # Probably unused self.ps = article.ps # Probably unused
self.publicationDate = article.date self.publicationDate = article.date
self.draft = False if article.statut == "publie" else True self.draft = False if article.statut == "publie" else True
@ -38,16 +38,16 @@ class Article:
self.virtual = article.virtuel # TODO Why? self.virtual = article.virtuel # TODO Why?
self.microblog = article.microblog # Probably unused self.microblog = article.microblog # Probably unused
def get_slug(self): def getSlug(self):
return slugify(f"{self.id}-{self.title}") return slugify(f"{self.id}-{self.title}")
def get_path(self): def getPath(self):
return self.get_slug() return self.getSlug()
def get_authors(self): def getAuthors(self):
return SpipAuteursLiens.select().where(SpipAuteursLiens.id_objet == self.id) return SpipAuteursLiens.select().where(SpipAuteursLiens.id_objet == self.id)
def get_frontmatter(self): def getFrontmatter(self):
return dump( return dump(
{ {
"lang": self.lang, "lang": self.lang,
@ -58,14 +58,14 @@ class Article:
"lastmod": self.update, "lastmod": self.update,
"draft": self.draft, "draft": self.draft,
"description": self.description, "description": self.description,
"authors": [author.id_auteur for author in self.get_authors()], "authors": [author.id_auteur for author in self.getAuthors()],
}, },
allow_unicode=True, allow_unicode=True,
) )
def get_article(self): def getArticle(self):
# Build the final article text # Build the final article text
article: str = "---\n" + self.get_frontmatter() + "---" article: str = "---\n" + self.getFrontmatter() + "---"
# If there is a caption, add the caption followed by a hr # If there is a caption, add the caption followed by a hr
if len(self.caption) > 0: if len(self.caption) > 0:
article += "\n\n" + self.caption + "\n\n***" article += "\n\n" + self.caption + "\n\n***"
@ -83,10 +83,34 @@ class Article:
article += "\n\n# MICROBLOGGING\n\n" + self.microblog article += "\n\n# MICROBLOGGING\n\n" + self.microblog
return article return article
def getUnknownChars(self):
    # Gather, for the title and then the body text, every excerpt matched by
    # the second ("surrounding") pattern of each unknownIso pair
    return [
        found.group()
        for content in (self.title, self.text)
        for _, context in unknownIso
        for found in context.finditer(content)
    ]
def highlightUnknownChars(text):
    """Return text with every unknown character wrapped in red + bold codes.

    The first pattern of each unknownIso pair is used to find the characters;
    each match is surrounded by terminal escape sequences so it stands out
    when the result is printed.
    """
    # Terminal escape sequences used to stylize the output
    COLOR = "\033[91m" + "\033[1m"  # Red + Bold
    RESET = "\033[0m"
    for char, _ in unknownIso:
        # Substitute through a callable instead of splicing at the offsets
        # yielded by finditer(): those offsets refer to the string as it was
        # before any insertion, so from the second match on the splice would
        # land at the wrong position. The callable also inserts the matched
        # text literally, without replacement-template processing.
        text = char.sub(lambda match: COLOR + match.group() + RESET, text)
    return text
class Articles: class Articles:
exported: int = 0 exported: int = 0
unknownChars: list = []
def __init__(self, maxToExport) -> None: def __init__(self, maxToExport) -> None:
# Query the DB to retrieve all articles sorted by publication date # Query the DB to retrieve all articles sorted by publication date
@ -103,9 +127,10 @@ class Articles:
def __next__(self): def __next__(self):
if self.remaining() <= 0: if self.remaining() <= 0:
raise StopIteration() raise StopIteration
self.exported += 1 self.exported += 1
article = Article(self.articles[self.exported - 1])
return ( return (
{"exported": self.exported, "remaining": self.remaining()}, {"exported": self.exported, "remaining": self.remaining()},
Article(self.articles[self.exported - 1]), article,
) )

View File

@ -1,7 +1,7 @@
#!python #!python
from config import config from config import config
from database import db from database import db
from iterator import Articles from iterator import Articles, highlightUnknownChars
if __name__ != "__main__": if __name__ != "__main__":
exit() exit()
@ -25,12 +25,13 @@ else:
maxToExport = config.defaultNbToExport maxToExport = config.defaultNbToExport
# Define terminal escape sequences to stylize output # Define terminal escape sequences to stylize output
R = "\033[91m" R: str = "\033[91m"
G = "\033[92m" G: str = "\033[92m"
B = "\033[94m" B: str = "\033[94m"
BOLD = "\033[1m" BOLD: str = "\033[1m"
UNDERLINE = "\033[4m" RESET: str = "\033[0m"
RESET = "\033[0m"
unknownChars: dict = {}
# Loop among first maxToExport articles & export them # Loop among first maxToExport articles & export them
for counter, article in Articles(maxToExport): for counter, article in Articles(maxToExport):
@ -39,12 +40,27 @@ for counter, article in Articles(maxToExport):
f"\n{BOLD}Exporting {R}{counter['remaining']+1}{RESET}" f"\n{BOLD}Exporting {R}{counter['remaining']+1}{RESET}"
+ f"{BOLD} SPIP articles to Markdown & YAML files{RESET}\n" + f"{BOLD} SPIP articles to Markdown & YAML files{RESET}\n"
) )
print(f"{BOLD}{counter['exported']}.{RESET} {article.title}") print(
fullPath = config.outputDir + "/" + article.get_path() f"{BOLD}{counter['exported']}.{RESET} " + highlightUnknownChars(article.title)
print(f"\t-> {fullPath}/index.md") )
fullPath = config.outputDir + "/" + article.getPath()
print(f"{BOLD}>{RESET} {fullPath}/index.md")
mkdir(fullPath) mkdir(fullPath)
with open(fullPath + "/index.md", "w") as f: with open(fullPath + "/index.md", "w") as f:
f.write(article.get_article()) f.write(article.getArticle())
# Store detected unknown characters
if len(article.getUnknownChars()) > 0:
unknownChars[article.title] = article.getUnknownChars()
# Report, for each stored article title, how many unknown characters were found
for title, excerpts in unknownChars.items():
    count = len(excerpts)
    plural = "s" if count > 1 else ""
    print(
        f"\n{BOLD}{count} unknown character{plural} detected in{RESET} "
        + highlightUnknownChars(title)
    )
    # Then list each stored excerpt with its unknown characters highlighted
    for excerpt in excerpts:
        print(f" {BOLD}{RESET} " + highlightUnknownChars(excerpt))
# Close the database connection # Close the database connection
db.close() db.close()