regex replacing spip to markdown conversion

This commit is contained in:
Guilhem Fauré 2023-05-10 11:00:27 +02:00
parent 8a6026d129
commit cf2345e43e
2 changed files with 102 additions and 34 deletions

View File

@ -1,6 +1,3 @@
pyyaml pyyaml
python-slugify[unidecode] python-slugify[unidecode]
peewee peewee
# pyparsing
# lark

View File

@ -1,36 +1,109 @@
import re import re
from os import path from os import path
# from lark import Lark
# from pyparsing import Word, alphas
# larkParser = Lark(open(path.dirname(__file__) + "/spip.lark"))
class content:
    """Convert a SPIP-markup string to Markdown with ordered regex rewrites.

    Each entry of ``_mappings`` maps a rule name to a
    ``(compiled pattern, replacement template)`` pair. The rules are applied
    one after another in insertion order, and that order matters: ``heading``
    (triple braces) must run before ``strong`` (double braces), which must
    run before ``emphasis`` (single braces).
    """

    _mappings = {
        "horizontal-rule": (
            re.compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", re.S | re.I),
            r"---",
        ),
        "line-break": (
            re.compile(r"\r?\n_ *(?=\r?\n)|<br ?.*?>", re.S | re.I),
            "\n",
        ),
        "heading": (
            re.compile(r"\{\{\{ *(.*?) *\}\}\}", re.S | re.I),
            r"## \1",
        ),
        "strong": (
            re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I),
            r"**\1**",
        ),
        "emphasis": (
            re.compile(r"\{ *(.*?) *\}", re.S | re.I),
            r"*\1*",
        ),
        "strikethrough": (
            re.compile(
                r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
                re.S | re.I,
            ),
            r"~\1~",
        ),
        "anchor": (
            re.compile(r"\[ *(.*?) *-> *(.*?) *\]", re.S | re.I),
            r"[\1](\2)",
        ),
        "image": (
            # Group 1 is everything up to the first "|" or ">"; the optional
            # "|options" tail is discarded. Negated character classes keep
            # the match linear-time on an unterminated tag followed by many
            # "|" characters (the former nested lazy quantifiers did not).
            re.compile(r"<(?:img|image)([^|>]*)(?:\|[^>]*)?>", re.S | re.I),
            r"![image](\1)",
        ),
        "document-anchors": (
            # Same shape as "image": keep the identifier, drop the options.
            re.compile(r"<(?:doc|emb)([^|>]*)(?:\|[^>]*)?>", re.S | re.I),
            r"[document](\1)",
        ),
        "wikilink": (
            re.compile(r"\[\? *(.*?) *\]", re.S | re.I),
            r"[\1](https://wikipedia.org/wiki/\1)",
        ),
        "footnote": (
            # Footnotes are dropped from the output.
            re.compile(r"\[\[ *(.*?) *\]\]", re.S | re.I),
            r"",
        ),
        "unordered-list": (
            re.compile(r"(\r?\n)-(?!#|-)\*? *", re.S | re.I),
            r"\1- ",
        ),
        "wrong-unordered-list": (
            re.compile(r"(\r?\n)\* +", re.S | re.I),
            r"\1- ",
        ),
        "ordered-list": (
            # \g<1> is required here: "\11" would be read as group 11.
            re.compile(r"(\r?\n)-# *", re.S | re.I),
            r"\g<1>1. ",
        ),
        "table-metadata": (
            # Table caption/summary lines are dropped from the output.
            re.compile(r"(\r?\n)\|\|(.*?)\|(.*?)\|\|", re.S | re.I),
            r"",
        ),
        "quote": (
            # Group 2 only participates when the quote is ended by blank
            # lines; it is then re-emitted twice to keep the paragraph break.
            re.compile(
                r"<(?:quote|poesie)>\s*(.*?)\s*(?:(\r?\n){2,}|<\/(?:quote|poesie)>)",
                re.S | re.I,
            ),
            r"> \1\2\2",
        ),
        "box": (
            re.compile(
                r"<code>\s*(.*?)\s*(?:(?:\r?\n){2,}|<\/code>)",
                re.S | re.I,
            ),
            "`\\1`",
        ),
        "fence": (
            re.compile(
                r"<cadre>\s*(.*?)\s*(?:(?:\r?\n){2,}|<\/cadre>)",
                re.S | re.I,
            ),
            "```\n\\1\n\n```",
        ),
        "multi-language": (  # Keep only the first language
            # The tail after the first language is matched lazily so that it
            # stops at the nearest </multi>. A greedy tail under DOTALL ran
            # to the LAST </multi> of the document and deleted everything
            # between two separate <multi> blocks.
            re.compile(
                r"<multi>\s*\[.{2,4}\]\s*(.*?)\s*(?:\[.{2,4}\].*?)?<\/multi>",
                re.S | re.I,
            ),
            r"\1",
        ),
    }

    def __init__(self, spip: str) -> None:
        """Store the raw SPIP markup to convert."""
        self.markup = spip

    def get_markdown(self) -> str:
        """Return the Markdown conversion of the stored SPIP markup.

        The stored markup is left untouched, so repeated calls return the
        same result instead of re-converting already converted text.
        """
        converted = self.markup
        for pattern, replacement in self._mappings.values():
            converted = pattern.sub(replacement, converted)
        return converted
# Parses a file & display its parse tree # Parses a file & display its parse tree
@ -40,14 +113,12 @@ def test(filename):
print(f"--- Conversion of {filename} ---\n\n") print(f"--- Conversion of {filename} ---\n\n")
c = content(raw) c = content(raw)
print(c.get_markdown()) print(c.get_markdown())
# print(f"--- Parse tree of {filename} ---\n\n")
# print(larkParser.parse(raw))
if __name__ == "__main__":
    # Convert each sample SPIP document in turn and print its Markdown
    for sample_path in (
        "../test/0.spip",
        "../test/1.spip",
        "../test/2.spip",
        "../test/3.spip",
        "../test/4.spip",
    ):
        test(sample_path)