use str.replace() instead of regex when not needed

This commit is contained in:
Guilhem Fauré 2023-05-16 11:29:22 +02:00
parent b61853a4d5
commit 1076040316
2 changed files with 31 additions and 37 deletions

View File

@ -161,106 +161,100 @@ spipToText = (
isoToUtf = (
# Broken encoding
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
compile("’"),
"’",
r"",
),
( # Fix UTF-8 ‘ that was interpreted as ISO 8859-1
compile("‘"),
"‘",
r"",
),
( # Fix UTF-8 é that was interpreted as ISO 8859-1
compile("\u0081"),
"\u0081",
r"é",
),
( # Fix UTF-8 è that was interpreted as ISO 8859-1
compile("è"),
"è",
r"è",
),
( # Fix UTF-8 ê that was interpreted as ISO 8859-1
compile(""),
"",
r"ê",
),
( # Fix UTF-8 ô that was interpreted as ISO 8859-1
compile(""),
"",
r"ô",
),
( # Fix UTF-8 î that was interpreted as ISO 8859-1
compile(""),
"",
r"î",
),
( # Fix UTF-8 ï that was interpreted as ISO 8859-1
compile("ˆ"),
"ˆ",
r"ï",
),
( # Fix UTF-8 ö that was interpreted as ISO 8859-1
compile("ˆ"),
"ˆ",
r"ö",
),
( # Fix UTF-8 ü that was interpreted as ISO 8859-1
compile("ˆ"),
"ˆ",
r"ü",
),
( # Fix UTF-8 à that was interpreted as ISO 8859-1
compile("à"),
"à",
r"à",
),
( # Fix UTF-8 … that was interpreted as ISO 8859-1
compile("…"),
"…",
r"",
),
( # Fix UTF-8 “ that was interpreted as ISO 8859-1
compile("“"),
"“",
r"",
),
( # Fix UTF-8 ” that was interpreted as ISO 8859-1
compile("â€\u009d"),
"â€\u009d",
r"",
),
( # Fix UTF-8 – (en dash) that was interpreted as ISO 8859-1
compile("–"),
"–",
r"",
),
( # Fix UTF-8 — (em dash) that was interpreted as ISO 8859-1
compile("—"),
"—",
r"",
),
( # Fix UTF-8 ‐ (hyphen, U+2010) that was interpreted as ISO 8859-1
compile("â€\u0090"),
"â€\u0090",
r"",
),
( # Fix UTF-8 • that was interpreted as ISO 8859-1
compile("•"),
"•",
r"",
),
( # Fix UTF-8 ç that was interpreted as ISO 8859-1
compile("ç"),
"ç",
r"ç",
),
( # Fix UTF-8 í that was interpreted as ISO 8859-1
compile("\u0081"),
"\u0081",
r"í",
),
# WARNING not sure
( # Fix UTF-8 é that was interpreted as ISO 8859-1
compile(""),
"",
r"é",
),
( # Fix UTF-8 † that was interpreted as ISO 8859-1
compile("†"),
"†",
r"",
),
)
## WARNING unknown broken encoding
unknownIso = (
( # unknown 
 + surroundings
compile(r"
"),
compile(r"
.*(?=\r?\n|$)"),
),
( # unknown â^† + surroundings
compile(r"∆"),
compile(r"∆.*(?=\r?\n|$)"),
),
r"
", # unknown 
 + surroundings
r"∆", # unknown â^† + surroundings
)
@ -269,7 +263,7 @@ def convertBody(spipBody):
for spip, markdown in spipToMarkdown:
text = spip.sub(markdown, text)
for iso, utf in isoToUtf:
text = iso.sub(utf, text)
text = text.replace(iso, utf)  # str.replace returns a new string, so rebind text
return text
@ -278,5 +272,5 @@ def convertMeta(spipMeta):
for spip, metadata in spipToText:
text = spip.sub(metadata, text)
for iso, utf in isoToUtf:
text = iso.sub(utf, text)
text = text.replace(iso, utf)  # str.replace returns a new string, so rebind text
return text

View File

@ -1,4 +1,4 @@
from re import escape
from re import finditer
from converter import convertBody, convertMeta, unknownIso
from database import *
@ -86,8 +86,8 @@ class Article:
def getUnknownChars(self):
errors: list = []
for text in (self.title, self.text):
for _, surrounding in unknownIso:
for match in surrounding.finditer(text):
for char in unknownIso:
for match in finditer(char + r".*(?=\r?\n|$)", text):
errors.append(match.group())
return errors
@ -97,8 +97,8 @@ def highlightUnknownChars(text):
COLOR = "\033[91m" + "\033[1m" # Red + Bold
RESET = "\033[0m"
# Highlight in COLOR unknown chars in text
for char, _ in unknownIso:
for match in char.finditer(text):
for char in unknownIso:
for match in finditer(char, text):
text = (
text[: match.start()]
+ COLOR