use str.replace() instead of regex when not needed
This commit is contained in:
parent
b61853a4d5
commit
1076040316
@ -161,106 +161,100 @@ spipToText = (
|
|||||||
isoToUtf = (
|
isoToUtf = (
|
||||||
# Broken encoding
|
# Broken encoding
|
||||||
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
|
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
|
||||||
compile("’"),
|
"’",
|
||||||
r"’",
|
r"’",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 ‘ that was interpreted as ISO 8859-1
|
( # Fix UTF-8 ‘ that was interpreted as ISO 8859-1
|
||||||
compile("‘"),
|
"‘",
|
||||||
r"‘",
|
r"‘",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 é that was interpreted as ISO 8859-1
|
( # Fix UTF-8 é that was interpreted as ISO 8859-1
|
||||||
compile("eÌ\u0081"),
|
"eÌ\u0081",
|
||||||
r"é",
|
r"é",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 è that was interpreted as ISO 8859-1
|
( # Fix UTF-8 è that was interpreted as ISO 8859-1
|
||||||
compile("è"),
|
"è",
|
||||||
r"è",
|
r"è",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 ê that was interpreted as ISO 8859-1
|
( # Fix UTF-8 ê that was interpreted as ISO 8859-1
|
||||||
compile("ê"),
|
"ê",
|
||||||
r"ê",
|
r"ê",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 ô that was interpreted as ISO 8859-1
|
( # Fix UTF-8 ô that was interpreted as ISO 8859-1
|
||||||
compile("ô"),
|
"ô",
|
||||||
r"ô",
|
r"ô",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 î that was interpreted as ISO 8859-1
|
( # Fix UTF-8 î that was interpreted as ISO 8859-1
|
||||||
compile("î"),
|
"î",
|
||||||
r"î",
|
r"î",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 ï that was interpreted as ISO 8859-1
|
( # Fix UTF-8 ï that was interpreted as ISO 8859-1
|
||||||
compile("ï"),
|
"ï",
|
||||||
r"ï",
|
r"ï",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 ö that was interpreted as ISO 8859-1
|
( # Fix UTF-8 ö that was interpreted as ISO 8859-1
|
||||||
compile("ö"),
|
"ö",
|
||||||
r"ö",
|
r"ö",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 ü that was interpreted as ISO 8859-1
|
( # Fix UTF-8 ü that was interpreted as ISO 8859-1
|
||||||
compile("ü"),
|
"ü",
|
||||||
r"ü",
|
r"ü",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 à that was interpreted as ISO 8859-1
|
( # Fix UTF-8 à that was interpreted as ISO 8859-1
|
||||||
compile("à"),
|
"à",
|
||||||
r"à",
|
r"à",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 … that was interpreted as ISO 8859-1
|
( # Fix UTF-8 … that was interpreted as ISO 8859-1
|
||||||
compile("…"),
|
"…",
|
||||||
r"…",
|
r"…",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 “ that was interpreted as ISO 8859-1
|
( # Fix UTF-8 “ that was interpreted as ISO 8859-1
|
||||||
compile("“"),
|
"“",
|
||||||
r"“",
|
r"“",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 ” that was interpreted as ISO 8859-1
|
( # Fix UTF-8 ” that was interpreted as ISO 8859-1
|
||||||
compile("â€\u009d"),
|
"â€\u009d",
|
||||||
r"”",
|
r"”",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 – that was interpreted as ISO 8859-1
|
( # Fix UTF-8 – that was interpreted as ISO 8859-1
|
||||||
compile("–"),
|
"–",
|
||||||
r"–",
|
r"–",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 — that was interpreted as ISO 8859-1
|
( # Fix UTF-8 — that was interpreted as ISO 8859-1
|
||||||
compile("—"),
|
"—",
|
||||||
r"—",
|
r"—",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 − that was interpreted as ISO 8859-1
|
( # Fix UTF-8 − that was interpreted as ISO 8859-1
|
||||||
compile("â€\u0090"),
|
"â€\u0090",
|
||||||
r"−",
|
r"−",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 • that was interpreted as ISO 8859-1
|
( # Fix UTF-8 • that was interpreted as ISO 8859-1
|
||||||
compile("•"),
|
"•",
|
||||||
r"•",
|
r"•",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 ç that was interpreted as ISO 8859-1
|
( # Fix UTF-8 ç that was interpreted as ISO 8859-1
|
||||||
compile("ç"),
|
"ç",
|
||||||
r"ç",
|
r"ç",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 í that was interpreted as ISO 8859-1
|
( # Fix UTF-8 í that was interpreted as ISO 8859-1
|
||||||
compile("iÌ\u0081"),
|
"iÌ\u0081",
|
||||||
r"í",
|
r"í",
|
||||||
),
|
),
|
||||||
# WARNING not sure
|
# WARNING not sure
|
||||||
( # Fix UTF-8 é that was interpreted as ISO 8859-1
|
( # Fix UTF-8 é that was interpreted as ISO 8859-1
|
||||||
compile("eÌ "),
|
"eÌ ",
|
||||||
r"é",
|
r"é",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 † that was interpreted as ISO 8859-1
|
( # Fix UTF-8 † that was interpreted as ISO 8859-1
|
||||||
compile("†"),
|
"†",
|
||||||
r"† ",
|
r"† ",
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
## WARNING unknown broken encoding
|
## WARNING unknown broken encoding
|
||||||
unknownIso = (
|
unknownIso = (
|
||||||
( # unknown 
 + surroundings
|
r"
", # unknown 
 + surroundings
|
||||||
compile(r"
"),
|
r"∆", # unknown â^† + surroundings
|
||||||
compile(r"
.*(?=\r?\n|$)"),
|
|
||||||
),
|
|
||||||
( # unknown â^† + surroundings
|
|
||||||
compile(r"∆"),
|
|
||||||
compile(r"∆.*(?=\r?\n|$)"),
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -269,7 +263,7 @@ def convertBody(spipBody):
|
|||||||
for spip, markdown in spipToMarkdown:
|
for spip, markdown in spipToMarkdown:
|
||||||
text = spip.sub(markdown, text)
|
text = spip.sub(markdown, text)
|
||||||
for iso, utf in isoToUtf:
|
for iso, utf in isoToUtf:
|
||||||
text = iso.sub(utf, text)
|
text.replace(iso, utf)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
@ -278,5 +272,5 @@ def convertMeta(spipMeta):
|
|||||||
for spip, metadata in spipToText:
|
for spip, metadata in spipToText:
|
||||||
text = spip.sub(metadata, text)
|
text = spip.sub(metadata, text)
|
||||||
for iso, utf in isoToUtf:
|
for iso, utf in isoToUtf:
|
||||||
text = iso.sub(utf, text)
|
text.replace(iso, utf)
|
||||||
return text
|
return text
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from re import escape
|
from re import finditer
|
||||||
|
|
||||||
from converter import convertBody, convertMeta, unknownIso
|
from converter import convertBody, convertMeta, unknownIso
|
||||||
from database import *
|
from database import *
|
||||||
@ -86,8 +86,8 @@ class Article:
|
|||||||
def getUnknownChars(self):
|
def getUnknownChars(self):
|
||||||
errors: list = []
|
errors: list = []
|
||||||
for text in (self.title, self.text):
|
for text in (self.title, self.text):
|
||||||
for _, surrounding in unknownIso:
|
for char in unknownIso:
|
||||||
for match in surrounding.finditer(text):
|
for match in finditer(char + r".*(?=\r?\n|$)", text):
|
||||||
errors.append(match.group())
|
errors.append(match.group())
|
||||||
return errors
|
return errors
|
||||||
|
|
||||||
@ -97,8 +97,8 @@ def highlightUnknownChars(text):
|
|||||||
COLOR = "\033[91m" + "\033[1m" # Red + Bold
|
COLOR = "\033[91m" + "\033[1m" # Red + Bold
|
||||||
RESET = "\033[0m"
|
RESET = "\033[0m"
|
||||||
# Highlight in COLOR unknown chars in text
|
# Highlight in COLOR unknown chars in text
|
||||||
for char, _ in unknownIso:
|
for char in unknownIso:
|
||||||
for match in char.finditer(text):
|
for match in finditer(char, text):
|
||||||
text = (
|
text = (
|
||||||
text[: match.start()]
|
text[: match.start()]
|
||||||
+ COLOR
|
+ COLOR
|
||||||
|
Loading…
Reference in New Issue
Block a user