From 107604031662df565dcdf5a345fb89f64c8871cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guilhem=20Faur=C3=A9?= <pro@gfaure.eu>
Date: Tue, 16 May 2023 11:29:22 +0200
Subject: [PATCH] use str.replace() instead of regex when not needed

---
 spip2md/converter.py | 58 ++++++++++++++++++++------------------------
 spip2md/iterator.py  | 10 ++++----
 2 files changed, 31 insertions(+), 37 deletions(-)

diff --git a/spip2md/converter.py b/spip2md/converter.py
index 0eaeddb..69214a5 100644
--- a/spip2md/converter.py
+++ b/spip2md/converter.py
@@ -161,106 +161,100 @@ spipToText = (
 isoToUtf = (
     # Broken encoding
     (  # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1
-        compile("â€™"),
+        "â€™",
         r"’",
     ),
     (  # Fix UTF-8 † that was interpreted as ISO 8859-1
-        compile("â€˜"),
+        "â€˜",
         r"‘",
     ),
     (  # Fix UTF-8 é that was interpreted as ISO 8859-1
-        compile("eÌ\u0081"),
+        "eÌ\u0081",
         r"é",
     ),
     (  # Fix UTF-8 è that was interpreted as ISO 8859-1
-        compile("eÌ€"),
+        "eÌ€",
         r"è",
     ),
     (  # Fix UTF-8 ê that was interpreted as ISO 8859-1
-        compile("eÌ‚"),
+        "eÌ‚",
         r"ê",
     ),
     (  # Fix UTF-8 ê that was interpreted as ISO 8859-1
-        compile("oÌ‚"),
+        "oÌ‚",
         r"ô",
     ),
     (  # Fix UTF-8 î that was interpreted as ISO 8859-1
-        compile("iÌ‚"),
+        "iÌ‚",
         r"î",
     ),
     (  # Fix UTF-8 ï that was interpreted as ISO 8859-1
-        compile("iÌˆ"),
+        "iÌˆ",
         r"ï",
     ),
     (  # Fix UTF-8 ö that was interpreted as ISO 8859-1
-        compile("oÌˆ"),
+        "oÌˆ",
         r"ö",
     ),
     (  # Fix UTF-8 ö that was interpreted as ISO 8859-1
-        compile("uÌˆ"),
+        "uÌˆ",
         r"ü",
     ),
     (  # Fix UTF-8 é that was interpreted as ISO 8859-1
-        compile("aÌ€"),
+        "aÌ€",
         r"à",
     ),
     (  # Fix UTF-8 … that was interpreted as ISO 8859-1
-        compile("â€¦"),
+        "â€¦",
         r"…",
     ),
     (  # Fix UTF-8 “ that was interpreted as ISO 8859-1
-        compile("â€œ"),
+        "â€œ",
         r"“",
     ),
     (  # Fix UTF-8 ” that was interpreted as ISO 8859-1
-        compile("â€\u009d"),
+        "â€\u009d",
         r"”",
     ),
     (  # Fix UTF-8 – that was interpreted as ISO 8859-1
-        compile("â€“"),
+        "â€“",
         r"–",
     ),
     (  # Fix UTF-8 – that was interpreted as ISO 8859-1
-        compile("â€”"),
+        "â€”",
         r"—",
     ),
     (  # Fix UTF-8 − that was interpreted as ISO 8859-1
-        compile("â€\u0090"),
+        "â€\u0090",
         r"−",
     ),
     (  # Fix UTF-8 • that was interpreted as ISO 8859-1
-        compile("â€¢"),
+        "â€¢",
         r"•",
     ),
     (  # Fix UTF-8 ç that was interpreted as ISO 8859-1
-        compile("Ã§"),
+        "Ã§",
         r"ç",
     ),
     (  # Fix UTF-8 í that was interpreted as ISO 8859-1
-        compile("iÌ\u0081"),
+        "iÌ\u0081",
         r"í",
     ),
     # WARNING not sure
     (  # Fix UTF-8 é that was interpreted as ISO 8859-1
-        compile("eÌ "),
+        "eÌ ",
         r"é",
     ),
     (  # Fix UTF-8 † that was interpreted as ISO 8859-1
-        compile("â€ "),
+        "â€ ",
         r"† ",
     ),
 )
 
 ## WARNING unknown broken encoding
 unknownIso = (
-    (  # unknown â€¨ + surroundings
-        compile(r"â€¨"),
-        compile(r"â€¨.*(?=\r?\n|$)"),
-    ),
-    (  # unknown â^† + surroundings
-        compile(r"âˆ†"),
-        compile(r"âˆ†.*(?=\r?\n|$)"),
-    ),
+    r"â€¨",  # unknown; NOTE(review): maybe U+2028 (E2 80 A8) read as cp1252 — confirm
+    r"âˆ†",  # unknown; NOTE(review): maybe U+2206 ∆ (E2 88 86) read as cp1252 — confirm
 )
 
 
@@ -269,7 +263,7 @@ def convertBody(spipBody):
     for spip, markdown in spipToMarkdown:
         text = spip.sub(markdown, text)
     for iso, utf in isoToUtf:
-        text = iso.sub(utf, text)
+        text = text.replace(iso, utf)  # str.replace returns a new string; rebind it
     return text
 
 
@@ -278,5 +272,5 @@ def convertMeta(spipMeta):
     for spip, metadata in spipToText:
         text = spip.sub(metadata, text)
     for iso, utf in isoToUtf:
-        text = iso.sub(utf, text)
+        text = text.replace(iso, utf)  # str.replace returns a new string; rebind it
     return text
diff --git a/spip2md/iterator.py b/spip2md/iterator.py
index e23a4e7..dd4f606 100644
--- a/spip2md/iterator.py
+++ b/spip2md/iterator.py
@@ -1,4 +1,4 @@
-from re import escape
+from re import finditer
 
 from converter import convertBody, convertMeta, unknownIso
 from database import *
@@ -86,8 +86,8 @@ class Article:
     def getUnknownChars(self):
         errors: list = []
         for text in (self.title, self.text):
-            for _, surrounding in unknownIso:
-                for match in surrounding.finditer(text):
+            for char in unknownIso:
+                for match in finditer(char + r".*(?=\r?\n|$)", text):
                     errors.append(match.group())
         return errors
 
@@ -97,8 +97,8 @@ def highlightUnknownChars(text):
     COLOR = "\033[91m" + "\033[1m"  # Red + Bold
     RESET = "\033[0m"
     # Highlight in COLOR unknown chars in text
-    for char, _ in unknownIso:
-        for match in char.finditer(text):
+    for char in unknownIso:
+        for match in finditer(char, text):
             text = (
                 text[: match.start()]
                 + COLOR