use str.replace() instead of regex when not needed
This commit is contained in:
parent
b61853a4d5
commit
1076040316
@ -161,106 +161,100 @@ spipToText = (
|
||||
isoToUtf = (
|
||||
# Broken encoding
|
||||
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
|
||||
compile("’"),
|
||||
"’",
|
||||
r"’",
|
||||
),
|
||||
( # Fix UTF-8 ‘ that was interpreted as ISO 8859-1
|
||||
compile("‘"),
|
||||
"‘",
|
||||
r"‘",
|
||||
),
|
||||
( # Fix UTF-8 é that was interpreted as ISO 8859-1
|
||||
compile("eÌ\u0081"),
|
||||
"eÌ\u0081",
|
||||
r"é",
|
||||
),
|
||||
( # Fix UTF-8 è that was interpreted as ISO 8859-1
|
||||
compile("è"),
|
||||
"è",
|
||||
r"è",
|
||||
),
|
||||
( # Fix UTF-8 ê that was interpreted as ISO 8859-1
|
||||
compile("ê"),
|
||||
"ê",
|
||||
r"ê",
|
||||
),
|
||||
( # Fix UTF-8 ô that was interpreted as ISO 8859-1
|
||||
compile("ô"),
|
||||
"ô",
|
||||
r"ô",
|
||||
),
|
||||
( # Fix UTF-8 î that was interpreted as ISO 8859-1
|
||||
compile("î"),
|
||||
"î",
|
||||
r"î",
|
||||
),
|
||||
( # Fix UTF-8 ï that was interpreted as ISO 8859-1
|
||||
compile("ï"),
|
||||
"ï",
|
||||
r"ï",
|
||||
),
|
||||
( # Fix UTF-8 ö that was interpreted as ISO 8859-1
|
||||
compile("ö"),
|
||||
"ö",
|
||||
r"ö",
|
||||
),
|
||||
( # Fix UTF-8 ü that was interpreted as ISO 8859-1
|
||||
compile("ü"),
|
||||
"ü",
|
||||
r"ü",
|
||||
),
|
||||
( # Fix UTF-8 à that was interpreted as ISO 8859-1
|
||||
compile("à"),
|
||||
"à",
|
||||
r"à",
|
||||
),
|
||||
( # Fix UTF-8 … that was interpreted as ISO 8859-1
|
||||
compile("…"),
|
||||
"…",
|
||||
r"…",
|
||||
),
|
||||
( # Fix UTF-8 “ that was interpreted as ISO 8859-1
|
||||
compile("“"),
|
||||
"“",
|
||||
r"“",
|
||||
),
|
||||
( # Fix UTF-8 ” that was interpreted as ISO 8859-1
|
||||
compile("â€\u009d"),
|
||||
"â€\u009d",
|
||||
r"”",
|
||||
),
|
||||
( # Fix UTF-8 – that was interpreted as ISO 8859-1
|
||||
compile("–"),
|
||||
"–",
|
||||
r"–",
|
||||
),
|
||||
( # Fix UTF-8 — that was interpreted as ISO 8859-1
|
||||
compile("—"),
|
||||
"—",
|
||||
r"—",
|
||||
),
|
||||
( # Fix UTF-8 − that was interpreted as ISO 8859-1
|
||||
compile("â€\u0090"),
|
||||
"â€\u0090",
|
||||
r"−",
|
||||
),
|
||||
( # Fix UTF-8 • that was interpreted as ISO 8859-1
|
||||
compile("•"),
|
||||
"•",
|
||||
r"•",
|
||||
),
|
||||
( # Fix UTF-8 ç that was interpreted as ISO 8859-1
|
||||
compile("ç"),
|
||||
"ç",
|
||||
r"ç",
|
||||
),
|
||||
( # Fix UTF-8 í that was interpreted as ISO 8859-1
|
||||
compile("iÌ\u0081"),
|
||||
"iÌ\u0081",
|
||||
r"í",
|
||||
),
|
||||
# WARNING not sure
|
||||
( # Fix UTF-8 é that was interpreted as ISO 8859-1
|
||||
compile("eÌ "),
|
||||
"eÌ ",
|
||||
r"é",
|
||||
),
|
||||
( # Fix UTF-8 † that was interpreted as ISO 8859-1
|
||||
compile("†"),
|
||||
"†",
|
||||
r"† ",
|
||||
),
|
||||
)
|
||||
|
||||
## WARNING unknown broken encoding
|
||||
unknownIso = (
|
||||
( # unknown 
 + surroundings
|
||||
compile(r"
"),
|
||||
compile(r"
.*(?=\r?\n|$)"),
|
||||
),
|
||||
( # unknown â^† + surroundings
|
||||
compile(r"∆"),
|
||||
compile(r"∆.*(?=\r?\n|$)"),
|
||||
),
|
||||
r"
", # unknown 
 + surroundings
|
||||
r"∆", # unknown â^† + surroundings
|
||||
)
|
||||
|
||||
|
||||
@ -269,7 +263,7 @@ def convertBody(spipBody):
|
||||
for spip, markdown in spipToMarkdown:
|
||||
text = spip.sub(markdown, text)
|
||||
for iso, utf in isoToUtf:
|
||||
text = iso.sub(utf, text)
|
||||
text.replace(iso, utf)
|
||||
return text
|
||||
|
||||
|
||||
@ -278,5 +272,5 @@ def convertMeta(spipMeta):
|
||||
for spip, metadata in spipToText:
|
||||
text = spip.sub(metadata, text)
|
||||
for iso, utf in isoToUtf:
|
||||
text = iso.sub(utf, text)
|
||||
text.replace(iso, utf)
|
||||
return text
|
||||
|
@ -1,4 +1,4 @@
|
||||
from re import escape
|
||||
from re import finditer
|
||||
|
||||
from converter import convertBody, convertMeta, unknownIso
|
||||
from database import *
|
||||
@ -86,8 +86,8 @@ class Article:
|
||||
def getUnknownChars(self):
|
||||
errors: list = []
|
||||
for text in (self.title, self.text):
|
||||
for _, surrounding in unknownIso:
|
||||
for match in surrounding.finditer(text):
|
||||
for char in unknownIso:
|
||||
for match in finditer(char + r".*(?=\r?\n|$)", text):
|
||||
errors.append(match.group())
|
||||
return errors
|
||||
|
||||
@ -97,8 +97,8 @@ def highlightUnknownChars(text):
|
||||
COLOR = "\033[91m" + "\033[1m" # Red + Bold
|
||||
RESET = "\033[0m"
|
||||
# Highlight in COLOR unknown chars in text
|
||||
for char, _ in unknownIso:
|
||||
for match in char.finditer(text):
|
||||
for char in unknownIso:
|
||||
for match in finditer(char, text):
|
||||
text = (
|
||||
text[: match.start()]
|
||||
+ COLOR
|
||||
|
Loading…
Reference in New Issue
Block a user