From bc616cc7a11acb53c4dc67ac94fa8fb84f0f6dc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guilhem=20Faur=C3=A9?= Date: Mon, 15 May 2023 17:18:36 +0200 Subject: [PATCH] started allowing to gather unknown encoding bugs --- spip2md/converter.py | 10 ++++++---- spip2md/iterator.py | 6 +++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/spip2md/converter.py b/spip2md/converter.py index 761b3ac..30e1513 100644 --- a/spip2md/converter.py +++ b/spip2md/converter.py @@ -257,23 +257,25 @@ unknownIso = (compile(r"\w*
.*\r?\n"),) # unknown 
 + surroundings def convertBody(spipBody): text = spipBody + errors = [] for spip, markdown in spipToMarkdown: text = spip.sub(markdown, text) for iso, utf in isoToUtf: text = iso.sub(utf, text) for iso in unknownIso: for match in iso.finditer(text): - print(f" UNKNOWN CHARACTER {match.group()}") - return text + errors.append(match.group()) + return text, errors def convertMeta(spipMeta): text = spipMeta + errors = [] for spip, metadata in spipToText: text = spip.sub(metadata, text) for iso, utf in isoToUtf: text = iso.sub(utf, text) for iso in unknownIso: for match in iso.finditer(text): - print(f" UNKNOWN CHARACTER {match.group()}") - return text + errors.append(match.group()) + return text, errors diff --git a/spip2md/iterator.py b/spip2md/iterator.py index b584241..a359a6d 100644 --- a/spip2md/iterator.py +++ b/spip2md/iterator.py @@ -9,12 +9,12 @@ class Article: def __init__(self, article): self.id = article.id_article # self.surtitle = article.surtitre # Probably unused - self.title = convertMeta(article.titre) + self.title, self.title_unknown = convertMeta(article.titre) self.subtitle = article.soustitre # Probably unused # self.section = article.id_rubrique # TODO join - self.description = convertMeta(article.descriptif) + self.description, self.description_unknown = convertMeta(article.descriptif) self.caption = article.chapo # Probably unused - self.text = convertBody(article.texte) # Markdown + self.text, self.text_unknown = convertBody(article.texte) # Markdown self.ps = article.ps # Probably unused self.publicationDate = article.date self.draft = False if article.statut == "publie" else True