more encoding fixes, warns when unknown encoding
This commit is contained in:
parent
3e3259c564
commit
65e9f0a67b
@ -1,7 +1,7 @@
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
mappings = (
|
# SPIP syntax to Markdown
|
||||||
# SPIP syntax to Markdown
|
spipToMarkdown = (
|
||||||
( # horizontal rule
|
( # horizontal rule
|
||||||
re.compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", re.S | re.I),
|
re.compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", re.S | re.I),
|
||||||
# r"---",
|
# r"---",
|
||||||
@ -99,101 +99,107 @@ mappings = (
|
|||||||
),
|
),
|
||||||
r"\1",
|
r"\1",
|
||||||
),
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
isoToUtf = (
|
||||||
# Broken encoding
|
# Broken encoding
|
||||||
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
|
||||||
re.compile("’"),
|
re.compile("’"),
|
||||||
r"’",
|
r"’",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 ‘ that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 ‘ that was interpreted as ISO 8859-1
|
||||||
re.compile("‘"),
|
re.compile("‘"),
|
||||||
r"‘",
|
r"‘",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 é that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 é that was interpreted as ISO 8859-1
|
||||||
re.compile("eÌ\u0081"),
|
re.compile("eÌ\u0081"),
|
||||||
r"é",
|
r"é",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 è that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 è that was interpreted as ISO 8859-1
|
||||||
re.compile("è"),
|
re.compile("è"),
|
||||||
r"è",
|
r"è",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 ê that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 ê that was interpreted as ISO 8859-1
|
||||||
re.compile("ê"),
|
re.compile("ê"),
|
||||||
r"ê",
|
r"ê",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 ô that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 ô that was interpreted as ISO 8859-1
|
||||||
re.compile("ô"),
|
re.compile("ô"),
|
||||||
r"ô",
|
r"ô",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 î that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 î that was interpreted as ISO 8859-1
|
||||||
re.compile("î"),
|
re.compile("î"),
|
||||||
r"î",
|
r"î",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 ï that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 ï that was interpreted as ISO 8859-1
|
||||||
re.compile("ï"),
|
re.compile("ï"),
|
||||||
r"ï",
|
r"ï",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 ö that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 ö that was interpreted as ISO 8859-1
|
||||||
re.compile("ö"),
|
re.compile("ö"),
|
||||||
r"ö",
|
r"ö",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 ü that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 ü that was interpreted as ISO 8859-1
|
||||||
re.compile("ü"),
|
re.compile("ü"),
|
||||||
r"ü",
|
r"ü",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 à that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 à that was interpreted as ISO 8859-1
|
||||||
re.compile("à"),
|
re.compile("à"),
|
||||||
r"à",
|
r"à",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 … that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 … that was interpreted as ISO 8859-1
|
||||||
re.compile("…"),
|
re.compile("…"),
|
||||||
r"…",
|
r"…",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 “ that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 “ that was interpreted as ISO 8859-1
|
||||||
re.compile("“"),
|
re.compile("“"),
|
||||||
r"“",
|
r"“",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 ” that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 ” that was interpreted as ISO 8859-1
|
||||||
re.compile("â€\u009d"),
|
re.compile("â€\u009d"),
|
||||||
r"”",
|
r"”",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 – that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 – that was interpreted as ISO 8859-1
|
||||||
re.compile("–"),
|
re.compile("–"),
|
||||||
r"–",
|
r"–",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 — that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 — that was interpreted as ISO 8859-1
|
||||||
re.compile("—"),
|
re.compile("—"),
|
||||||
r"—",
|
r"—",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 − that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 − that was interpreted as ISO 8859-1
|
||||||
re.compile("â€\u0090"),
|
re.compile("â€\u0090"),
|
||||||
r"−",
|
r"−",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 • that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 • that was interpreted as ISO 8859-1
|
||||||
re.compile("•"),
|
re.compile("•"),
|
||||||
r"•",
|
r"•",
|
||||||
),
|
),
|
||||||
( # Fix UTF-8 † that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 í that was interpreted as ISO 8859-1
|
||||||
re.compile("†"),
|
re.compile("iÌ\u0081"),
|
||||||
r"† ",
|
r"í",
|
||||||
),
|
),
|
||||||
## WARNING unknown or not sure
|
# WARNING not sure
|
||||||
( # Fix UTF-8 é that was interpreted as ISO 8859-1 and saved like so
|
( # Fix UTF-8 é that was interpreted as ISO 8859-1
|
||||||
re.compile("eÌ "),
|
re.compile("eÌ "),
|
||||||
r"é",
|
r"é",
|
||||||
),
|
),
|
||||||
( # Delete unknown 

|
( # Fix UTF-8 † that was interpreted as ISO 8859-1
|
||||||
re.compile("
"),
|
re.compile("†"),
|
||||||
r"",
|
r"† ",
|
||||||
),
|
|
||||||
( # Delete unknown Ì\u0081
|
|
||||||
re.compile("Ì\u0081"),
|
|
||||||
r"",
|
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
## WARNING unknown broken encoding
|
||||||
|
unknownIso = (re.compile(r"\w*
.*\r?\n"),) # unknown 
 + surroundings
|
||||||
|
|
||||||
|
|
||||||
def convert(markup):
|
def convert(markup):
|
||||||
for spip, markdown in mappings:
|
for spip, markdown in spipToMarkdown:
|
||||||
markup = spip.sub(markdown, markup)
|
markup = spip.sub(markdown, markup)
|
||||||
# return markup.encode("utf-8").decode("utf-8")
|
for iso, utf in isoToUtf:
|
||||||
|
markup = iso.sub(utf, markup)
|
||||||
|
for iso in unknownIso:
|
||||||
|
for match in iso.finditer(markup):
|
||||||
|
print(f" UNKNOWN CHARACTER {match.group()}")
|
||||||
return markup
|
return markup
|
||||||
|
Loading…
Reference in New Issue
Block a user