fixed most of the encoding bugs

This commit is contained in:
Guilhem Fauré 2023-05-11 11:36:23 +02:00
parent b3fa5023c4
commit 995fee5b6a

View File

@ -1,6 +1,7 @@
import re
mappings = (
# SPIP syntax to Markdown
( # horizontal-rule
re.compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", re.S | re.I),
# r"---",
@ -97,18 +98,95 @@ mappings = (
),
r"\1",
),
# Broken encoding
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1 and saved like so
re.compile("’"),
r"",
),
( # Fix UTF-8 ‘ that was interpreted as ISO 8859-1 and saved like so
re.compile("‘"),
r"",
),
( # Fix UTF-8 é that was interpreted as ISO 8859-1 and saved like so
re.compile("\u0081"),
r"é",
),
( # Fix UTF-8 è that was interpreted as ISO 8859-1 and saved like so
re.compile("è"),
r"è",
),
( # Fix UTF-8 ê that was interpreted as ISO 8859-1 and saved like so
re.compile(""),
r"ê",
),
( # Fix UTF-8 ô that was interpreted as ISO 8859-1 and saved like so
re.compile(""),
r"ô",
),
( # Fix UTF-8 î that was interpreted as ISO 8859-1 and saved like so
re.compile(""),
r"î",
),
( # Fix UTF-8 ï that was interpreted as ISO 8859-1 and saved like so
re.compile("ˆ"),
r"ï",
),
( # Fix UTF-8 ö that was interpreted as ISO 8859-1 and saved like so
re.compile("ˆ"),
r"ö",
),
( # Fix UTF-8 ü that was interpreted as ISO 8859-1 and saved like so
re.compile("ˆ"),
r"ü",
),
( # WARNING Fix UTF-8 é ? that was interpreted as ISO 8859-1 and saved like so
re.compile(""),
r"é",
),
( # Fix UTF-8 à that was interpreted as ISO 8859-1 and saved like so
re.compile("à"),
r"à",
),
( # Fix UTF-8 … that was interpreted as ISO 8859-1 and saved like so
re.compile("…"),
r"",
),
( # Fix UTF-8 “ that was interpreted as ISO 8859-1 and saved like so
re.compile("“"),
r"",
),
( # Fix UTF-8 ” that was interpreted as ISO 8859-1 and saved like so
re.compile("â€\u009d"),
r"",
),
( # Fix UTF-8 – (en dash) that was interpreted as ISO 8859-1 and saved like so
re.compile("–"),
r"",
),
( # Fix UTF-8 — (em dash) that was interpreted as ISO 8859-1 and saved like so
re.compile("—"),
r"",
),
( # Fix UTF-8 ‐ (hyphen, U+2010) that was interpreted as ISO 8859-1 and saved like so
re.compile("â€\u0090"),
r"",
),
( # Fix UTF-8 • that was interpreted as ISO 8859-1 and saved like so
re.compile("•"),
r"",
),
( # Fix UTF-8 † that was interpreted as ISO 8859-1 and saved like so
re.compile("†"),
r"",
),
( # Delete unknown 

re.compile("
"),
r"",
),
( # Delete unknown Ì\u0081
re.compile("Ì\u0081"),
r"",
),
)