use str.replace() instead of regex when not needed

This commit is contained in:
Guilhem Fauré 2023-05-16 11:29:22 +02:00
parent b61853a4d5
commit 1076040316
2 changed files with 31 additions and 37 deletions

View File

@ -161,106 +161,100 @@ spipToText = (
isoToUtf = ( isoToUtf = (
# Broken encoding # Broken encoding
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1 ( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
compile("’"), "’",
r"", r"",
), ),
( # Fix UTF-8 ‘ that was interpreted as ISO 8859-1 ( # Fix UTF-8 ‘ that was interpreted as ISO 8859-1
compile("‘"), "‘",
r"", r"",
), ),
( # Fix UTF-8 é that was interpreted as ISO 8859-1 ( # Fix UTF-8 é that was interpreted as ISO 8859-1
compile("\u0081"), "\u0081",
r"é", r"é",
), ),
( # Fix UTF-8 è that was interpreted as ISO 8859-1 ( # Fix UTF-8 è that was interpreted as ISO 8859-1
compile("è"), "è",
r"è", r"è",
), ),
( # Fix UTF-8 ê that was interpreted as ISO 8859-1 ( # Fix UTF-8 ê that was interpreted as ISO 8859-1
compile(""), "",
r"ê", r"ê",
), ),
( # Fix UTF-8 ô that was interpreted as ISO 8859-1 ( # Fix UTF-8 ô that was interpreted as ISO 8859-1
compile(""), "",
r"ô", r"ô",
), ),
( # Fix UTF-8 î that was interpreted as ISO 8859-1 ( # Fix UTF-8 î that was interpreted as ISO 8859-1
compile(""), "",
r"î", r"î",
), ),
( # Fix UTF-8 ï that was interpreted as ISO 8859-1 ( # Fix UTF-8 ï that was interpreted as ISO 8859-1
compile("ˆ"), "ˆ",
r"ï", r"ï",
), ),
( # Fix UTF-8 ö that was interpreted as ISO 8859-1 ( # Fix UTF-8 ö that was interpreted as ISO 8859-1
compile("ˆ"), "ˆ",
r"ö", r"ö",
), ),
( # Fix UTF-8 ü that was interpreted as ISO 8859-1 ( # Fix UTF-8 ü that was interpreted as ISO 8859-1
compile("ˆ"), "ˆ",
r"ü", r"ü",
), ),
( # Fix UTF-8 à that was interpreted as ISO 8859-1 ( # Fix UTF-8 à that was interpreted as ISO 8859-1
compile("à"), "à",
r"à", r"à",
), ),
( # Fix UTF-8 … that was interpreted as ISO 8859-1 ( # Fix UTF-8 … that was interpreted as ISO 8859-1
compile("…"), "…",
r"", r"",
), ),
( # Fix UTF-8 “ that was interpreted as ISO 8859-1 ( # Fix UTF-8 “ that was interpreted as ISO 8859-1
compile("“"), "“",
r"", r"",
), ),
( # Fix UTF-8 ” that was interpreted as ISO 8859-1 ( # Fix UTF-8 ” that was interpreted as ISO 8859-1
compile("â€\u009d"), "â€\u009d",
r"", r"",
), ),
( # Fix UTF-8 – that was interpreted as ISO 8859-1 ( # Fix UTF-8 – that was interpreted as ISO 8859-1
compile("–"), "–",
r"", r"",
), ),
( # Fix UTF-8 — that was interpreted as ISO 8859-1 ( # Fix UTF-8 — that was interpreted as ISO 8859-1
compile("—"), "—",
r"", r"",
), ),
( # Fix UTF-8 ‐ that was interpreted as ISO 8859-1 ( # Fix UTF-8 ‐ that was interpreted as ISO 8859-1
compile("â€\u0090"), "â€\u0090",
r"", r"",
), ),
( # Fix UTF-8 • that was interpreted as ISO 8859-1 ( # Fix UTF-8 • that was interpreted as ISO 8859-1
compile("•"), "•",
r"", r"",
), ),
( # Fix UTF-8 ç that was interpreted as ISO 8859-1 ( # Fix UTF-8 ç that was interpreted as ISO 8859-1
compile("ç"), "ç",
r"ç", r"ç",
), ),
( # Fix UTF-8 í that was interpreted as ISO 8859-1 ( # Fix UTF-8 í that was interpreted as ISO 8859-1
compile("\u0081"), "\u0081",
r"í", r"í",
), ),
# WARNING not sure # WARNING not sure
( # Fix UTF-8 é that was interpreted as ISO 8859-1 ( # Fix UTF-8 é that was interpreted as ISO 8859-1
compile(""), "",
r"é", r"é",
), ),
( # Fix UTF-8 † that was interpreted as ISO 8859-1 ( # Fix UTF-8 † that was interpreted as ISO 8859-1
compile("†"), "†",
r"", r"",
), ),
) )
## WARNING unknown broken encoding ## WARNING unknown broken encoding
unknownIso = ( unknownIso = (
( # unknown 
 + surroundings r"
", # unknown 
 + surroundings
compile(r"
"), r"∆", # unknown â^† + surroundings
compile(r"
.*(?=\r?\n|$)"),
),
( # unknown â^† + surroundings
compile(r"∆"),
compile(r"∆.*(?=\r?\n|$)"),
),
) )
@ -269,7 +263,7 @@ def convertBody(spipBody):
for spip, markdown in spipToMarkdown: for spip, markdown in spipToMarkdown:
text = spip.sub(markdown, text) text = spip.sub(markdown, text)
for iso, utf in isoToUtf: for iso, utf in isoToUtf:
text = iso.sub(utf, text) text.replace(iso, utf)
return text return text
@ -278,5 +272,5 @@ def convertMeta(spipMeta):
for spip, metadata in spipToText: for spip, metadata in spipToText:
text = spip.sub(metadata, text) text = spip.sub(metadata, text)
for iso, utf in isoToUtf: for iso, utf in isoToUtf:
text = iso.sub(utf, text) text.replace(iso, utf)
return text return text

View File

@ -1,4 +1,4 @@
from re import escape from re import finditer
from converter import convertBody, convertMeta, unknownIso from converter import convertBody, convertMeta, unknownIso
from database import * from database import *
@ -86,8 +86,8 @@ class Article:
def getUnknownChars(self): def getUnknownChars(self):
errors: list = [] errors: list = []
for text in (self.title, self.text): for text in (self.title, self.text):
for _, surrounding in unknownIso: for char in unknownIso:
for match in surrounding.finditer(text): for match in finditer(char + r".*(?=\r?\n|$)", text):
errors.append(match.group()) errors.append(match.group())
return errors return errors
@ -97,8 +97,8 @@ def highlightUnknownChars(text):
COLOR = "\033[91m" + "\033[1m" # Red + Bold COLOR = "\033[91m" + "\033[1m" # Red + Bold
RESET = "\033[0m" RESET = "\033[0m"
# Highlight in COLOR unknown chars in text # Highlight in COLOR unknown chars in text
for char, _ in unknownIso: for char in unknownIso:
for match in char.finditer(text): for match in finditer(char, text):
text = ( text = (
text[: match.start()] text[: match.start()]
+ COLOR + COLOR