unknown characters highlighting & reporting
This commit is contained in:
parent
12db0375e7
commit
b61853a4d5
@ -253,32 +253,30 @@ isoToUtf = (
|
|||||||
|
|
||||||
## WARNING unknown broken encoding
|
## WARNING unknown broken encoding
|
||||||
unknownIso = (
|
unknownIso = (
|
||||||
compile(r"\w*
.*\r?\n"), # unknown 
 + surroundings
|
( # unknown 
 + surroundings
|
||||||
compile(r"\w*∆.*\r?\n"), # unknown â^† + surroundings
|
compile(r"
"),
|
||||||
|
compile(r"
.*(?=\r?\n|$)"),
|
||||||
|
),
|
||||||
|
( # unknown â^† + surroundings
|
||||||
|
compile(r"∆"),
|
||||||
|
compile(r"∆.*(?=\r?\n|$)"),
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def convertBody(spipBody):
|
def convertBody(spipBody):
|
||||||
text = spipBody
|
text: str = spipBody
|
||||||
errors = []
|
|
||||||
for spip, markdown in spipToMarkdown:
|
for spip, markdown in spipToMarkdown:
|
||||||
text = spip.sub(markdown, text)
|
text = spip.sub(markdown, text)
|
||||||
for iso, utf in isoToUtf:
|
for iso, utf in isoToUtf:
|
||||||
text = iso.sub(utf, text)
|
text = iso.sub(utf, text)
|
||||||
for iso in unknownIso:
|
return text
|
||||||
for match in iso.finditer(text):
|
|
||||||
errors.append(match.group())
|
|
||||||
return text, errors
|
|
||||||
|
|
||||||
|
|
||||||
def convertMeta(spipMeta):
|
def convertMeta(spipMeta):
|
||||||
text = spipMeta
|
text: str = spipMeta
|
||||||
errors = []
|
|
||||||
for spip, metadata in spipToText:
|
for spip, metadata in spipToText:
|
||||||
text = spip.sub(metadata, text)
|
text = spip.sub(metadata, text)
|
||||||
for iso, utf in isoToUtf:
|
for iso, utf in isoToUtf:
|
||||||
text = iso.sub(utf, text)
|
text = iso.sub(utf, text)
|
||||||
for iso in unknownIso:
|
return text
|
||||||
for match in iso.finditer(text):
|
|
||||||
errors.append(match.group())
|
|
||||||
return text, errors
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
from array import array
|
from re import escape
|
||||||
|
|
||||||
from converter import convertBody, convertMeta
|
from converter import convertBody, convertMeta, unknownIso
|
||||||
from database import *
|
from database import *
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
# from yaml import CDumper as Dumper
|
# from yaml import CDumper as Dumper
|
||||||
@ -11,12 +11,12 @@ class Article:
|
|||||||
def __init__(self, article):
|
def __init__(self, article):
|
||||||
self.id = article.id_article
|
self.id = article.id_article
|
||||||
# self.surtitle = article.surtitre # Probably unused
|
# self.surtitle = article.surtitre # Probably unused
|
||||||
self.title, self.title_unknown = convertMeta(article.titre)
|
self.title = convertMeta(article.titre)
|
||||||
self.subtitle = article.soustitre # Probably unused
|
self.subtitle = article.soustitre # Probably unused
|
||||||
# self.section = article.id_rubrique # TODO join
|
# self.section = article.id_rubrique # TODO join
|
||||||
self.description, self.description_unknown = convertMeta(article.descriptif)
|
self.description = convertMeta(article.descriptif)
|
||||||
self.caption = article.chapo # Probably unused
|
self.caption = article.chapo # Probably unused
|
||||||
self.text, self.text_unknown = convertBody(article.texte) # Markdown
|
self.text = convertBody(article.texte) # Markdown
|
||||||
self.ps = article.ps # Probably unused
|
self.ps = article.ps # Probably unused
|
||||||
self.publicationDate = article.date
|
self.publicationDate = article.date
|
||||||
self.draft = False if article.statut == "publie" else True
|
self.draft = False if article.statut == "publie" else True
|
||||||
@ -38,16 +38,16 @@ class Article:
|
|||||||
self.virtual = article.virtuel # TODO Why ?
|
self.virtual = article.virtuel # TODO Why ?
|
||||||
self.microblog = article.microblog # Probably unused
|
self.microblog = article.microblog # Probably unused
|
||||||
|
|
||||||
def get_slug(self):
|
def getSlug(self):
|
||||||
return slugify(f"{self.id}-{self.title}")
|
return slugify(f"{self.id}-{self.title}")
|
||||||
|
|
||||||
def get_path(self):
|
def getPath(self):
|
||||||
return self.get_slug()
|
return self.getSlug()
|
||||||
|
|
||||||
def get_authors(self):
|
def getAuthors(self):
|
||||||
return SpipAuteursLiens.select().where(SpipAuteursLiens.id_objet == self.id)
|
return SpipAuteursLiens.select().where(SpipAuteursLiens.id_objet == self.id)
|
||||||
|
|
||||||
def get_frontmatter(self):
|
def getFrontmatter(self):
|
||||||
return dump(
|
return dump(
|
||||||
{
|
{
|
||||||
"lang": self.lang,
|
"lang": self.lang,
|
||||||
@ -58,14 +58,14 @@ class Article:
|
|||||||
"lastmod": self.update,
|
"lastmod": self.update,
|
||||||
"draft": self.draft,
|
"draft": self.draft,
|
||||||
"description": self.description,
|
"description": self.description,
|
||||||
"authors": [author.id_auteur for author in self.get_authors()],
|
"authors": [author.id_auteur for author in self.getAuthors()],
|
||||||
},
|
},
|
||||||
allow_unicode=True,
|
allow_unicode=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_article(self):
|
def getArticle(self):
|
||||||
# Build the final article text
|
# Build the final article text
|
||||||
article: str = "---\n" + self.get_frontmatter() + "---"
|
article: str = "---\n" + self.getFrontmatter() + "---"
|
||||||
# If there is a caption, add the caption followed by a hr
|
# If there is a caption, add the caption followed by a hr
|
||||||
if len(self.caption) > 0:
|
if len(self.caption) > 0:
|
||||||
article += "\n\n" + self.caption + "\n\n***"
|
article += "\n\n" + self.caption + "\n\n***"
|
||||||
@ -83,10 +83,34 @@ class Article:
|
|||||||
article += "\n\n# MICROBLOGGING\n\n" + self.microblog
|
article += "\n\n# MICROBLOGGING\n\n" + self.microblog
|
||||||
return article
|
return article
|
||||||
|
|
||||||
|
def getUnknownChars(self):
    """Collect every unknown-character occurrence found in this article.

    The title and then the text are scanned with the second ("surrounding")
    pattern of each ``unknownIso`` pair, so each reported string is the
    unknown character together with whatever context that pattern captures.

    Returns:
        A list of matched strings, title matches first, then text matches;
        empty when nothing was detected.
    """
    return [
        found.group()
        for field in (self.title, self.text)
        for _, context in unknownIso
        for found in context.finditer(field)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def highlightUnknownChars(text, patterns=None):
    """Return ``text`` with every unknown character shown in red + bold.

    Args:
        text: The string to decorate for terminal output.
        patterns: Optional iterable of ``(char, surrounding)`` pairs whose
            first item is a compiled regex matching one unknown character.
            Defaults to the module-level ``unknownIso`` table.

    Returns:
        A copy of ``text`` in which each match of a ``char`` pattern is
        wrapped in terminal escape sequences; ``text`` unchanged when
        nothing matches.
    """
    # Terminal escape sequences used to stylize the output
    COLOR = "\033[91m" + "\033[1m"  # Red + Bold
    RESET = "\033[0m"

    if patterns is None:
        patterns = unknownIso

    def paint(match):
        # A callable replacement is inserted verbatim: no backslash or group
        # reference processing is applied to the escape sequences.
        return COLOR + match.group() + RESET

    # Substitute in a single pass per pattern. Slicing ``text`` by match
    # offsets while inserting codes would shift every later offset and
    # misplace the highlighting from the second match onwards.
    for char, _ in patterns:
        text = char.sub(paint, text)
    return text
|
||||||
|
|
||||||
|
|
||||||
class Articles:
|
class Articles:
|
||||||
exported: int = 0
|
exported: int = 0
|
||||||
unknownChars: list = []
|
|
||||||
|
|
||||||
def __init__(self, maxToExport) -> None:
|
def __init__(self, maxToExport) -> None:
|
||||||
# Query the DB to retrieve all articles sorted by publication date
|
# Query the DB to retrieve all articles sorted by publication date
|
||||||
@ -103,9 +127,10 @@ class Articles:
|
|||||||
|
|
||||||
def __next__(self):
|
def __next__(self):
|
||||||
if self.remaining() <= 0:
|
if self.remaining() <= 0:
|
||||||
raise StopIteration()
|
raise StopIteration
|
||||||
self.exported += 1
|
self.exported += 1
|
||||||
|
article = Article(self.articles[self.exported - 1])
|
||||||
return (
|
return (
|
||||||
{"exported": self.exported, "remaining": self.remaining()},
|
{"exported": self.exported, "remaining": self.remaining()},
|
||||||
Article(self.articles[self.exported - 1]),
|
article,
|
||||||
)
|
)
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
#!python
|
#!python
|
||||||
from config import config
|
from config import config
|
||||||
from database import db
|
from database import db
|
||||||
from iterator import Articles
|
from iterator import Articles, highlightUnknownChars
|
||||||
|
|
||||||
if __name__ != "__main__":
|
if __name__ != "__main__":
|
||||||
exit()
|
exit()
|
||||||
@ -25,12 +25,13 @@ else:
|
|||||||
maxToExport = config.defaultNbToExport
|
maxToExport = config.defaultNbToExport
|
||||||
|
|
||||||
# Define terminal escape sequences to stylize output
|
# Define terminal escape sequences to stylize output
|
||||||
R = "\033[91m"
|
R: str = "\033[91m"
|
||||||
G = "\033[92m"
|
G: str = "\033[92m"
|
||||||
B = "\033[94m"
|
B: str = "\033[94m"
|
||||||
BOLD = "\033[1m"
|
BOLD: str = "\033[1m"
|
||||||
UNDERLINE = "\033[4m"
|
RESET: str = "\033[0m"
|
||||||
RESET = "\033[0m"
|
|
||||||
|
unknownChars: dict = {}
|
||||||
|
|
||||||
# Loop among first maxToExport articles & export them
|
# Loop among first maxToExport articles & export them
|
||||||
for counter, article in Articles(maxToExport):
|
for counter, article in Articles(maxToExport):
|
||||||
@ -39,12 +40,27 @@ for counter, article in Articles(maxToExport):
|
|||||||
f"\n{BOLD}Exporting {R}{counter['remaining']+1}{RESET}"
|
f"\n{BOLD}Exporting {R}{counter['remaining']+1}{RESET}"
|
||||||
+ f"{BOLD} SPIP articles to Markdown & YAML files{RESET}\n"
|
+ f"{BOLD} SPIP articles to Markdown & YAML files{RESET}\n"
|
||||||
)
|
)
|
||||||
print(f"{BOLD}{counter['exported']}.{RESET} {article.title}")
|
print(
|
||||||
fullPath = config.outputDir + "/" + article.get_path()
|
f"{BOLD}{counter['exported']}.{RESET} " + highlightUnknownChars(article.title)
|
||||||
print(f"\t-> {fullPath}/index.md")
|
)
|
||||||
|
fullPath = config.outputDir + "/" + article.getPath()
|
||||||
|
print(f"{BOLD}>{RESET} {fullPath}/index.md")
|
||||||
mkdir(fullPath)
|
mkdir(fullPath)
|
||||||
with open(fullPath + "/index.md", "w") as f:
|
with open(fullPath + "/index.md", "w") as f:
|
||||||
f.write(article.get_article())
|
f.write(article.getArticle())
|
||||||
|
# Store detected unknown characters, keyed by article title.
# Scan the article once and reuse the result: getUnknownChars() re-runs every
# unknownIso pattern over the title and the text on each call.
unknown = article.getUnknownChars()
if unknown:
    unknownChars[article.title] = unknown
|
||||||
|
|
||||||
|
# Report, for each stored article title, how many unknown characters were
# detected, followed by one highlighted line per detected occurrence
for title, findings in unknownChars.items():
    nb = len(findings)
    plural = "s" if nb > 1 else ""
    header = f"\n{BOLD}{nb} " + f"unknown character{plural} detected in{RESET} "
    print(header + highlightUnknownChars(title))
    for excerpt in findings:
        print(f" {BOLD}…{RESET} " + highlightUnknownChars(excerpt))
|
||||||
|
|
||||||
# Close the database connection
|
# Close the database connection
|
||||||
db.close()
|
db.close()
|
||||||
|
Loading…
Reference in New Issue
Block a user