more encoding fixes, warns when unknown encoding

This commit is contained in:
Guilhem Fauré 2023-05-11 14:22:13 +02:00
parent 3e3259c564
commit 65e9f0a67b

View File

@ -1,7 +1,7 @@
import re import re
mappings = (
# SPIP syntax to Markdown # SPIP syntax to Markdown
spipToMarkdown = (
( # horizontal rule ( # horizontal rule
re.compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", re.S | re.I), re.compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", re.S | re.I),
# r"---", # r"---",
@ -99,101 +99,107 @@ mappings = (
), ),
r"\1", r"\1",
), ),
)
isoToUtf = (
# Broken encoding # Broken encoding
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
re.compile("’"), re.compile("’"),
r"", r"",
), ),
( # Fix UTF-8 ‘ that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 ‘ that was interpreted as ISO 8859-1
re.compile("‘"), re.compile("‘"),
r"", r"",
), ),
( # Fix UTF-8 é that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 é that was interpreted as ISO 8859-1
re.compile("\u0081"), re.compile("\u0081"),
r"é", r"é",
), ),
( # Fix UTF-8 è that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 è that was interpreted as ISO 8859-1
re.compile("è"), re.compile("è"),
r"è", r"è",
), ),
( # Fix UTF-8 ê that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 ê that was interpreted as ISO 8859-1
re.compile(""), re.compile(""),
r"ê", r"ê",
), ),
( # Fix UTF-8 ô that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 ô that was interpreted as ISO 8859-1
re.compile(""), re.compile(""),
r"ô", r"ô",
), ),
( # Fix UTF-8 î that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 î that was interpreted as ISO 8859-1
re.compile(""), re.compile(""),
r"î", r"î",
), ),
( # Fix UTF-8 ï that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 ï that was interpreted as ISO 8859-1
re.compile("ˆ"), re.compile("ˆ"),
r"ï", r"ï",
), ),
( # Fix UTF-8 ö that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 ö that was interpreted as ISO 8859-1
re.compile("ˆ"), re.compile("ˆ"),
r"ö", r"ö",
), ),
( # Fix UTF-8 ü that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 ü that was interpreted as ISO 8859-1
re.compile("ˆ"), re.compile("ˆ"),
r"ü", r"ü",
), ),
( # Fix UTF-8 à that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 à that was interpreted as ISO 8859-1
re.compile("à"), re.compile("à"),
r"à", r"à",
), ),
( # Fix UTF-8 … that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 … that was interpreted as ISO 8859-1
re.compile("…"), re.compile("…"),
r"", r"",
), ),
( # Fix UTF-8 “ that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 “ that was interpreted as ISO 8859-1
re.compile("“"), re.compile("“"),
r"", r"",
), ),
( # Fix UTF-8 ” that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 ” that was interpreted as ISO 8859-1
re.compile("â€\u009d"), re.compile("â€\u009d"),
r"", r"",
), ),
( # Fix UTF-8 – (en dash) that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 – (en dash) that was interpreted as ISO 8859-1
re.compile("–"), re.compile("–"),
r"", r"",
), ),
( # Fix UTF-8 — (em dash) that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 — (em dash) that was interpreted as ISO 8859-1
re.compile("—"), re.compile("—"),
r"", r"",
), ),
( # Fix UTF-8 ‐ (hyphen, U+2010) that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 ‐ (hyphen, U+2010) that was interpreted as ISO 8859-1
re.compile("â€\u0090"), re.compile("â€\u0090"),
r"", r"",
), ),
( # Fix UTF-8 • that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 • that was interpreted as ISO 8859-1
re.compile("•"), re.compile("•"),
r"", r"",
), ),
( # Fix UTF-8 † that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 í that was interpreted as ISO 8859-1
re.compile("†"), re.compile("\u0081"),
r"", r"í",
), ),
## WARNING unknown or not sure # WARNING not sure
( # Fix UTF-8 é that was interpreted as ISO 8859-1 and saved like so ( # Fix UTF-8 é that was interpreted as ISO 8859-1
re.compile(""), re.compile(""),
r"é", r"é",
), ),
( # Delete unknown 
 ( # Fix UTF-8 † that was interpreted as ISO 8859-1
re.compile("
"), re.compile("†"),
r"", r"",
),
( # Delete unknown Ì\u0081
re.compile("Ì\u0081"),
r"",
), ),
) )
## WARNING unknown broken encoding
unknownIso = (re.compile(r"\w*
.*\r?\n"),) # unknown 
 + surroundings
def convert(markup): def convert(markup):
for spip, markdown in mappings: for spip, markdown in spipToMarkdown:
markup = spip.sub(markdown, markup) markup = spip.sub(markdown, markup)
# return markup.encode("utf-8").decode("utf-8") for iso, utf in isoToUtf:
markup = iso.sub(utf, markup)
for iso in unknownIso:
for match in iso.finditer(markup):
print(f" UNKNOWN CHARACTER {match.group()}")
return markup return markup