Started allowing unknown-encoding bugs to be gathered

This commit is contained in:
Guilhem Fauré 2023-05-15 17:18:36 +02:00
parent b8f99fb329
commit bc616cc7a1
2 changed files with 9 additions and 7 deletions

View File

@ -257,23 +257,25 @@ unknownIso = (compile(r"\w*
.*\r?\n"),) # unknown 
 + surroundings
def convertBody(spipBody):
    """Convert a SPIP body to Markdown and collect unknown-character matches.

    Every (pattern, replacement) pair of ``spipToMarkdown`` is applied with
    ``pattern.sub``, then every pair of ``isoToUtf`` the same way. The
    resulting text is finally scanned with each pattern of ``unknownIso``.

    Args:
        spipBody: Text to convert (presumably raw SPIP markup; confirm
            against the caller).

    Returns:
        A ``(text, errors)`` tuple: the converted text, and a list holding
        the matched string of every ``unknownIso`` hit, in scan order. The
        list is empty when nothing matched.
    """
    text = spipBody
    errors = []
    for spip, markdown in spipToMarkdown:
        text = spip.sub(markdown, text)
    for iso, utf in isoToUtf:
        text = iso.sub(utf, text)
    # Unknown characters are gathered for the caller instead of being
    # printed, so they can be reported alongside the converted text.
    for iso in unknownIso:
        for match in iso.finditer(text):
            errors.append(match.group())
    return text, errors
def convertMeta(spipMeta):
    """Convert a SPIP metadata field to text and collect unknown-character matches.

    Every (pattern, replacement) pair of ``spipToText`` is applied with
    ``pattern.sub``, then every pair of ``isoToUtf`` the same way. The
    resulting text is finally scanned with each pattern of ``unknownIso``.

    Args:
        spipMeta: Metadata text to convert (presumably a title or
            description in SPIP markup; confirm against the caller).

    Returns:
        A ``(text, errors)`` tuple: the converted text, and a list holding
        the matched string of every ``unknownIso`` hit, in scan order. The
        list is empty when nothing matched.
    """
    text = spipMeta
    errors = []
    for spip, metadata in spipToText:
        text = spip.sub(metadata, text)
    for iso, utf in isoToUtf:
        text = iso.sub(utf, text)
    # Unknown characters are gathered for the caller instead of being
    # printed, so they can be reported alongside the converted text.
    for iso in unknownIso:
        for match in iso.finditer(text):
            errors.append(match.group())
    return text, errors

View File

@ -9,12 +9,12 @@ class Article:
def __init__(self, article):
self.id = article.id_article
# self.surtitle = article.surtitre # Probably unused
self.title = convertMeta(article.titre)
self.title, self.title_unknown = convertMeta(article.titre)
self.subtitle = article.soustitre # Probably unused
# self.section = article.id_rubrique # TODO join
self.description = convertMeta(article.descriptif)
self.description, self.description_unknown = convertMeta(article.descriptif)
self.caption = article.chapo # Probably unused
self.text = convertBody(article.texte) # Markdown
self.text, self.text_unknown = convertBody(article.texte) # Markdown
self.ps = article.ps # Probably unused
self.publicationDate = article.date
self.draft = False if article.statut == "publie" else True