From 107604031662df565dcdf5a345fb89f64c8871cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guilhem=20Faur=C3=A9?= Date: Tue, 16 May 2023 11:29:22 +0200 Subject: [PATCH] use str.replace() instead of regex when not needed --- spip2md/converter.py | 58 ++++++++++++++++++++------------------------ spip2md/iterator.py | 10 ++++---- 2 files changed, 31 insertions(+), 37 deletions(-) diff --git a/spip2md/converter.py b/spip2md/converter.py index 0eaeddb..69214a5 100644 --- a/spip2md/converter.py +++ b/spip2md/converter.py @@ -161,106 +161,100 @@ spipToText = ( isoToUtf = ( # Broken encoding ( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1 - compile("’"), + "’", r"’", ), ( # Fix UTF-8 † that was interpreted as ISO 8859-1 - compile("‘"), + "‘", r"‘", ), ( # Fix UTF-8 é that was interpreted as ISO 8859-1 - compile("eÌ\u0081"), + "eÌ\u0081", r"é", ), ( # Fix UTF-8 è that was interpreted as ISO 8859-1 - compile("eÌ€"), + "eÌ€", r"è", ), ( # Fix UTF-8 ê that was interpreted as ISO 8859-1 - compile("eÌ‚"), + "eÌ‚", r"ê", ), ( # Fix UTF-8 ê that was interpreted as ISO 8859-1 - compile("oÌ‚"), + "oÌ‚", r"ô", ), ( # Fix UTF-8 î that was interpreted as ISO 8859-1 - compile("iÌ‚"), + "iÌ‚", r"î", ), ( # Fix UTF-8 ï that was interpreted as ISO 8859-1 - compile("ï"), + "ï", r"ï", ), ( # Fix UTF-8 ö that was interpreted as ISO 8859-1 - compile("ö"), + "ö", r"ö", ), ( # Fix UTF-8 ö that was interpreted as ISO 8859-1 - compile("ü"), + "ü", r"ü", ), ( # Fix UTF-8 é that was interpreted as ISO 8859-1 - compile("aÌ€"), + "aÌ€", r"à", ), ( # Fix UTF-8 … that was interpreted as ISO 8859-1 - compile("…"), + "…", r"…", ), ( # Fix UTF-8 “ that was interpreted as ISO 8859-1 - compile("“"), + "“", r"“", ), ( # Fix UTF-8 ” that was interpreted as ISO 8859-1 - compile("â€\u009d"), + "â€\u009d", r"”", ), ( # Fix UTF-8 – that was interpreted as ISO 8859-1 - compile("–"), + "–", r"–", ), ( # Fix UTF-8 – that was interpreted as ISO 8859-1 - compile("—"), + "—", r"—", ), ( # Fix UTF-8 − that was 
interpreted as ISO 8859-1 - compile("â€\u0090"), + "â€\u0090", r"−", ), ( # Fix UTF-8 • that was interpreted as ISO 8859-1 - compile("•"), + "•", r"•", ), ( # Fix UTF-8 ç that was interpreted as ISO 8859-1 - compile("ç"), + "ç", r"ç", ), ( # Fix UTF-8 í that was interpreted as ISO 8859-1 - compile("iÌ\u0081"), + "iÌ\u0081", r"í", ), # WARNING not sure ( # Fix UTF-8 é that was interpreted as ISO 8859-1 - compile("eÌ "), + "eÌ ", r"é", ), ( # Fix UTF-8 † that was interpreted as ISO 8859-1 - compile("†"), + "†", r"† ", ), ) ## WARNING unknown broken encoding unknownIso = ( - ( # unknown 
 + surroundings - compile(r"
"), - compile(r"
.*(?=\r?\n|$)"), - ), - ( # unknown â^† + surroundings - compile(r"∆"), - compile(r"∆.*(?=\r?\n|$)"), - ), + r"
", # unknown 
 + surroundings + r"∆", # unknown â^† + surroundings ) @@ -269,7 +263,7 @@ def convertBody(spipBody): for spip, markdown in spipToMarkdown: text = spip.sub(markdown, text) for iso, utf in isoToUtf: - text = iso.sub(utf, text) + text = text.replace(iso, utf) return text @@ -278,5 +272,5 @@ def convertMeta(spipMeta): for spip, metadata in spipToText: text = spip.sub(metadata, text) for iso, utf in isoToUtf: - text = iso.sub(utf, text) + text = text.replace(iso, utf) return text diff --git a/spip2md/iterator.py b/spip2md/iterator.py index e23a4e7..dd4f606 100644 --- a/spip2md/iterator.py +++ b/spip2md/iterator.py @@ -1,4 +1,4 @@ -from re import escape +from re import finditer from converter import convertBody, convertMeta, unknownIso from database import * @@ -86,8 +86,8 @@ class Article: def getUnknownChars(self): errors: list = [] for text in (self.title, self.text): - for _, surrounding in unknownIso: - for match in surrounding.finditer(text): + for char in unknownIso: + for match in finditer(char + r".*(?=\r?\n|$)", text): errors.append(match.group()) return errors @@ -97,8 +97,8 @@ def highlightUnknownChars(text): COLOR = "\033[91m" + "\033[1m" # Red + Bold RESET = "\033[0m" # Highlight in COLOR unknown chars in text - for char, _ in unknownIso: - for match in char.finditer(text): + for char in unknownIso: + for match in finditer(char, text): text = ( text[: match.start()] + COLOR