diff --git a/spip2md/convert.py b/spip2md/convert.py
index f3eb001..4497222 100644
--- a/spip2md/convert.py
+++ b/spip2md/convert.py
@@ -1,7 +1,7 @@
import re
-mappings = (
- # SPIP syntax to Markdown
+# SPIP syntax to Markdown
+spipToMarkdown = (
( # horizontal rule
re.compile(r"- ?- ?- ?- ?[\- ]*|
", re.S | re.I),
# r"---",
@@ -99,101 +99,107 @@ mappings = (
),
r"\1",
),
+)
+
+isoToUtf = (
# Broken encoding
- ( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1 and saved like so
+ ( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
re.compile("’"),
r"’",
),
- ( # Fix UTF-8 † that was interpreted as ISO 8859-1 and saved like so
+ ( # Fix UTF-8 ‘ that was interpreted as ISO 8859-1
re.compile("‘"),
r"‘",
),
- ( # Fix UTF-8 é that was interpreted as ISO 8859-1 and saved like so
+ ( # Fix UTF-8 é that was interpreted as ISO 8859-1
re.compile("eÌ\u0081"),
r"é",
),
- ( # Fix UTF-8 è that was interpreted as ISO 8859-1 and saved like so
+ ( # Fix UTF-8 è that was interpreted as ISO 8859-1
re.compile("è"),
r"è",
),
- ( # Fix UTF-8 ê that was interpreted as ISO 8859-1 and saved like so
+ ( # Fix UTF-8 ê that was interpreted as ISO 8859-1
re.compile("ê"),
r"ê",
),
- ( # Fix UTF-8 ê that was interpreted as ISO 8859-1 and saved like so
+ ( # Fix UTF-8 ô that was interpreted as ISO 8859-1
re.compile("ô"),
r"ô",
),
- ( # Fix UTF-8 î that was interpreted as ISO 8859-1 and saved like so
+ ( # Fix UTF-8 î that was interpreted as ISO 8859-1
re.compile("î"),
r"î",
),
- ( # Fix UTF-8 ï that was interpreted as ISO 8859-1 and saved like so
+ ( # Fix UTF-8 ï that was interpreted as ISO 8859-1
re.compile("ï"),
r"ï",
),
- ( # Fix UTF-8 ö that was interpreted as ISO 8859-1 and saved like so
+ ( # Fix UTF-8 ö that was interpreted as ISO 8859-1
re.compile("ö"),
r"ö",
),
- ( # Fix UTF-8 ö that was interpreted as ISO 8859-1 and saved like so
+ ( # Fix UTF-8 ü that was interpreted as ISO 8859-1
re.compile("ü"),
r"ü",
),
- ( # Fix UTF-8 é that was interpreted as ISO 8859-1 and saved like so
+ ( # Fix UTF-8 à that was interpreted as ISO 8859-1
re.compile("à"),
r"à",
),
- ( # Fix UTF-8 … that was interpreted as ISO 8859-1 and saved like so
+ ( # Fix UTF-8 … that was interpreted as ISO 8859-1
re.compile("…"),
r"…",
),
- ( # Fix UTF-8 “ that was interpreted as ISO 8859-1 and saved like so
+ ( # Fix UTF-8 “ that was interpreted as ISO 8859-1
re.compile("“"),
r"“",
),
- ( # Fix UTF-8 ” that was interpreted as ISO 8859-1 and saved like so
+ ( # Fix UTF-8 ” that was interpreted as ISO 8859-1
re.compile("â€\u009d"),
r"”",
),
- ( # Fix UTF-8 – that was interpreted as ISO 8859-1 and saved like so
+ ( # Fix UTF-8 – that was interpreted as ISO 8859-1
re.compile("–"),
r"–",
),
- ( # Fix UTF-8 – that was interpreted as ISO 8859-1 and saved like so
+ ( # Fix UTF-8 — that was interpreted as ISO 8859-1
re.compile("—"),
r"—",
),
- ( # Fix UTF-8 − that was interpreted as ISO 8859-1 and saved like so
+ ( # Fix UTF-8 − that was interpreted as ISO 8859-1
re.compile("â€\u0090"),
r"−",
),
- ( # Fix UTF-8 • that was interpreted as ISO 8859-1 and saved like so
+ ( # Fix UTF-8 • that was interpreted as ISO 8859-1
re.compile("•"),
r"•",
),
- ( # Fix UTF-8 † that was interpreted as ISO 8859-1 and saved like so
- re.compile("†"),
- r"† ",
+ ( # Fix UTF-8 í that was interpreted as ISO 8859-1
+ re.compile("iÌ\u0081"),
+ r"í",
),
- ## WARNING unknown or not sure
- ( # Fix UTF-8 é that was interpreted as ISO 8859-1 and saved like so
+ # WARNING not sure
+ ( # Fix UTF-8 é that was interpreted as ISO 8859-1
re.compile("eÌ "),
r"é",
),
- ( # Delete unknown 

- re.compile("
"),
- r"",
- ),
- ( # Delete unknown Ì\u0081
- re.compile("Ì\u0081"),
- r"",
+ ( # Fix UTF-8 † that was interpreted as ISO 8859-1
+ re.compile("†"),
+ r"† ",
),
)
+## WARNING unknown broken encoding
+unknownIso = (re.compile(r"\w*
.*\r?\n"),) # unknown 
 + surroundings
+
def convert(markup):
- for spip, markdown in mappings:
+ for spip, markdown in spipToMarkdown:
markup = spip.sub(markdown, markup)
- # return markup.encode("utf-8").decode("utf-8")
+ for iso, utf in isoToUtf:
+ markup = iso.sub(utf, markup)
+ for iso in unknownIso:
+ for match in iso.finditer(markup):
+ print(f" UNKNOWN CHARACTER {match.group()}")
return markup