spip2md/spip2md/converter.py
2023-05-23 13:40:32 +02:00

349 lines
8.7 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# pyright: strict
from re import I, S, compile, finditer, sub
from typing import Optional
# SPIP syntax to Markdown
# Ordered table of (compiled pattern, replacement template) pairs.
# convert_body() applies the pairs one after another, in this order, so each
# rule works on the output of the rules above it. The longest brace markers
# ({{{ }}}) are therefore handled before the shorter ones ({{ }} then { }).
# S (DOTALL) lets "." match newlines, I makes the match case-insensitive.
spip_to_markdown = (
( # horizontal rule: four or more dashes (spaces allowed between) or an <hr> tag
compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I),
# r"---",
r"***",
),
( # line break: a line holding only "_" (plus spaces), or a <br> tag
compile(r"\r?\n_ *(?=\r?\n)|<br ?.*?>", S | I),
"\n",
),
( # heading: {{{ title }}}
compile(r"\{\{\{ *(.*?) *\}\}\}", S | I),
r"## \1", # Translate SPIP headings to h2
),
( # strong: {{ text }}; one optional trailing space is consumed and a
# single space is always written back after the closing "**"
compile(r"\{\{ *(.*?) *\}\} ?", S | I),
r"**\1** ",
),
( # html strong
compile(r"<strong> *(.*?) *</strong>", S | I),
r"**\1**",
),
( # emphasis: { text }; same trailing-space handling as the strong rule
compile(r"\{ *(.*?) *\} ?", S | I),
r"*\1* ",
),
( # html emphasis
compile(r"<i> *(.*?) *<\/i>", S | I),
r"*\1*",
),
( # strikethrough: <del> closed either by </del> or by two or more newlines
compile(
r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
S | I,
),
r"~\1~",
),
( # anchor: [text->target] becomes [text](target)
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
r"[\1](\2)",
),
( # wikilink: [?term] becomes a link to the Wikipedia page named "term"
compile(r"\[\? *(.*?) *\]", S | I),
r"[\1](https://wikipedia.org/wiki/\1)",
),
( # footnote: [[ ... ]] is deleted (empty replacement)
compile(r"\[\[ *(.*?) *\]\]", S | I),
r"",
),
( # unordered list: line starting with "-" not followed by "#" or "-",
# with an optional "*" right after it
compile(r"(\r?\n)-(?!#|-)\*? *", S | I),
r"\1- ",
),
( # wrong unordered list: line starting with "*" and at least one space
compile(r"(\r?\n)\* +", S | I),
r"\1- ",
),
( # wrong unordered list WARNING suppresses preceding tag
# Compiled without S, so the "<...>" part cannot span several lines.
compile(r"(\r?\n)<.*?>\* +", I),
r"\1- ",
),
( # ordered-list: every "-#" item becomes "1. "
compile(r"(\r?\n)-# *", S | I),
# \g<1> instead of \1: "\11" would be read as a reference to group 11
r"\g<1>1. ",
),
( # table-metadata: a line of the form ||...|...||
compile(r"(\r?\n)\|\|(.*?)\|(.*?)\|\|", S | I),
r"", # Remove it
),
( # quote: <quote> or <poesie>, closed by its tag or by two or more newlines
compile(
r"<(?:quote|poesie)>\s*(.*?)\s*(?:(\r?\n){2,}|<\/(?:quote|poesie)>)",
S | I,
),
# Group 2 is the newline captured when the block ended on blank lines; it
# is written back twice. It is empty when the closing tag ended the block.
r"> \1\2\2",
),
( # box: <code> becomes an inline code span
compile(
r"<code>\s*(.*?)\s*(?:(?:\r?\n){2,}|<\/code>)",
S | I,
),
"`\\1`",
),
( # fence: <cadre> becomes a fenced code block
compile(
r"<cadre>\s*(.*?)\s*(?:(?:\r?\n){2,}|<\/cadre>)",
S | I,
),
"```\n\\1\n\n```",
),
( # WARNING Keep only the first language in multi-language blocks
# An optional leading [xx] language marker (2 to 4 characters) is skipped;
# everything from the next such marker up to </multi> is discarded.
compile(
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
S | I,
),
r"\1",
),
)
# SPIP syntax to plain text
# Ordered table of (compiled pattern, replacement template) pairs applied by
# convert_meta(), in this order. Markup is dropped and only the inner text kept.
spip_to_text = (
( # strong
compile(r"\{\{ *(.*?) *\}\}", S | I),
r"\1",
),
( # html strong
compile(r"<strong> *(.*?) *</strong>", S | I),
r"\1",
),
( # emphasis
compile(r"\{ *(.*?) *\}", S | I),
r"\1",
),
( # html emphasis
compile(r"<i> *(.*?) *<\/i>", S | I),
r"\1",
),
( # strikethrough: <del> closed either by </del> or by two or more newlines
compile(
r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
S | I,
),
r"\1",
),
( # Keep only the first language in multi-language blocks
compile(
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
S | I,
),
r"\1",
),
( # remove every html tag, together with the spaces that follow it
compile(r"<\/?.*?> *", S | I),
r"",
),
# The two rules below are compiled without the M (MULTILINE) flag, so "^"
# only anchors at the very beginning of the text, not at every line.
( # Remove beginning with angle bracket(s)
compile(r"^>+ +", S | I),
r"",
),
( # Remove beginning with a number followed by a dot
compile(r"^\d+\. +", S | I),
r"",
),
)
# HTML tag WARNING can be used to remove them all
# Matches "<", an optional "/", the shortest possible run of any characters
# (newlines included, because of S) up to the next ">", then trailing spaces.
html_tag = compile(r"<\/?.*?> *", S | I)
# Broken ISO encoding to proper UTF-8
iso_to_utf = (
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
"’",
r"",
),
( # Fix UTF-8 † that was interpreted as ISO 8859-1
"‘",
r"",
),
( # Fix UTF-8 é that was interpreted as ISO 8859-1
"\u0081",
r"é",
),
( # Fix UTF-8 è that was interpreted as ISO 8859-1
"è",
r"è",
),
( # Fix UTF-8 ê that was interpreted as ISO 8859-1
"",
r"ê",
),
( # Fix UTF-8 ô that was interpreted as ISO 8859-1
"",
r"ô",
),
( # Fix UTF-8 î that was interpreted as ISO 8859-1
"",
r"î",
),
( # Fix UTF-8 ï that was interpreted as ISO 8859-1
"ˆ",
r"ï",
),
( # Fix UTF-8 ö that was interpreted as ISO 8859-1
"ˆ",
r"ö",
),
( # Fix UTF-8 ü that was interpreted as ISO 8859-1
"ˆ",
r"ü",
),
( # Fix UTF-8 à that was interpreted as ISO 8859-1
"à",
r"à",
),
( # Fix UTF-8 … that was interpreted as ISO 8859-1
"…",
r"",
),
( # Fix UTF-8 “ that was interpreted as ISO 8859-1
"“",
r"",
),
( # Fix UTF-8 ” that was interpreted as ISO 8859-1
"â€\u009d",
r"",
),
( # Fix UTF-8 that was interpreted as ISO 8859-1
"–",
r"",
),
( # Fix UTF-8 that was interpreted as ISO 8859-1
"—",
r"",
),
( # Fix UTF-8 that was interpreted as ISO 8859-1
"â€\u0090",
r"",
),
( # Fix UTF-8 • that was interpreted as ISO 8859-1
"•",
r"",
),
( # Fix UTF-8 ç that was interpreted as ISO 8859-1
"ç",
r"ç",
),
( # Fix UTF-8 î that was interpreted as ISO 8859-1
"î",
r"î",
),
( # Fix UTF-8 « that was interpreted as ISO 8859-1
"«",
r"«",
),
( # Fix UTF-8 » that was interpreted as ISO 8859-1
"»",
r"»",
),
( # Fix UTF-8 ° that was interpreted as ISO 8859-1
"°",
r"°",
),
( # Fix UTF-8 nbsp that was interpreted as ISO 8859-1
" ",
r" ",
),
( # Fix UTF-8 í that was interpreted as ISO 8859-1
"\u0081",
r"í",
),
# WARNING not sure
( # Fix UTF-8 é that was interpreted as ISO 8859-1
"",
r"é",
),
( # Fix UTF-8 † that was interpreted as ISO 8859-1
"†",
r"",
),
)
# WARNING unknown broken encoding
unknown_iso = (
r"
", # unknown 

r"∆", # unknown â^†
)
# Apply spip_to_markdown conversions to a text
def convert_body(text: Optional[str]) -> str:
    """Convert a SPIP-formatted body to Markdown.

    Every ``(pattern, replacement)`` pair of ``spip_to_markdown`` is applied
    in table order, then every ``(broken, fixed)`` pair of ``iso_to_utf`` is
    applied as a literal string replacement.

    Args:
        text: SPIP source text, or ``None``.

    Returns:
        The converted text, or an empty string when ``text`` is ``None``.
    """
    if text is None:
        return ""
    for spip, markdown in spip_to_markdown:
        text = spip.sub(markdown, text)
    for iso, utf in iso_to_utf:
        # str.replace("", utf) inserts utf between every character of the
        # text, so an entry whose search string is empty must be skipped.
        if iso:
            text = text.replace(iso, utf)
    return text
# Apply spip_to_text conversions to a text
def convert_meta(text: Optional[str]) -> str:
    """Convert a SPIP-formatted metadata field to plain text.

    Every ``(pattern, replacement)`` pair of ``spip_to_text`` is applied in
    table order, then every ``(broken, fixed)`` pair of ``iso_to_utf`` is
    applied as a literal string replacement.

    Args:
        text: SPIP source text, or ``None``.

    Returns:
        The converted text, or an empty string when ``text`` is ``None``.
    """
    if text is None:
        return ""
    for spip, metadata in spip_to_text:
        text = spip.sub(metadata, text)
    for iso, utf in iso_to_utf:
        # str.replace("", utf) inserts utf between every character of the
        # text, so an entry whose search string is empty must be skipped.
        if iso:
            text = text.replace(iso, utf)
    return text
# Replace images & documents in SPIP text with Markdown links with human-readable names
def convert_documents(text: str, documents: list[tuple[int, str, str]]) -> str:
    """Rewrite SPIP image/document references as Markdown images and links.

    For each ``(id, name, slug)`` triple, three kinds of reference are
    rewritten (``N`` is the id; an optional ``|option`` suffix is dropped):

    * ``<imgN>`` / ``<imageN>``  becomes ``![name](slug)``
    * ``<docN>`` / ``<embN>``    becomes ``[name](slug)``
    * ``[label](docN)`` / ``[label](embN)`` becomes ``[label](slug)``

    Args:
        text: Text in which the references are looked up.
        documents: ``(id, name, slug)`` triples, processed in order.

    Returns:
        The text with every recognised reference replaced.
    """
    for doc_id, name, slug in documents:
        ref = str(doc_id)
        image_md = f"![{name}]({slug})"
        link_md = f"[{name}]({slug})"
        # Replacements are passed as callables so that name and slug are
        # inserted literally: given as template strings, a backslash in them
        # would be parsed as an escape or a group reference by re.sub.
        text = sub(
            r"<(?:img|image)" + ref + r"(\|.*?)*>",
            lambda _match, md=image_md: md,
            text,
        )
        text = sub(
            r"<(?:doc|emb)" + ref + r"(\|.*?)*>",
            lambda _match, md=link_md: md,
            text,
        )
        # Already-Markdown links that still point at a SPIP document id:
        # keep the label (group 1) and swap the target for the slug.
        text = sub(
            r"\[(.*?)\]\((?:doc|emb)" + ref + r"(\|.*?)*\)",
            lambda match, target=slug: f"[{match.group(1)}]({target})",
            text,
        )
    return text
# Replace unknown chars with empty strings (delete them)
def remove_unknown_chars(text: str) -> str:
    """Return ``text`` with every sequence listed in ``unknown_iso`` deleted."""
    for char in unknown_iso:
        # Strings are immutable: replace() returns a new string, which has to
        # be bound back to ``text`` for the deletion to take effect.
        text = text.replace(char, "")
    return text
# Delete everything matched by the html_tag pattern
def remove_tags(text: str) -> str:
    """Return ``text`` with every match of ``html_tag`` replaced by nothing."""
    stripped = sub(html_tag, "", text)
    return stripped
# Locate the unknown sequences in a text
def unknown_chars(text: str) -> list[tuple[int, int]]:
    """Return the ``(start, end)`` span of every run of an unknown sequence.

    Each entry of ``unknown_iso`` is searched in turn, as a regular
    expression; consecutive repetitions of the same entry form a single span.
    Spans are grouped by entry, in ``unknown_iso`` order.
    """
    return [
        (found.start(), found.end())
        for sequence in unknown_iso
        for found in finditer("(" + sequence + ")+", text)
    ]
# Return strings with unknown chars found in text, surrounded by context_length chars
def get_unknown_chars(text: str, context_length: int = 20) -> list[str]:
    """Return an excerpt of ``text`` around each unknown sequence.

    An excerpt starts at most ``context_length`` characters before the
    sequence (never crossing a line break) and runs up to the next line break
    or the end of the text. Excerpts are grouped by ``unknown_iso`` entry.
    """
    lead = r".{0," + str(context_length) + r"}"
    excerpts: list[str] = []
    for sequence in unknown_iso:
        pattern = "".join(
            (lead, r"(?=", sequence, r")", sequence, r".*?(?=\r?\n|$)")
        )
        excerpts.extend(found.group() for found in finditer(pattern, text))
    return excerpts