diff --git a/spip2md/Metadata.py b/spip2md/Metadata.py
index 7f8fe3e..348b454 100644
--- a/spip2md/Metadata.py
+++ b/spip2md/Metadata.py
@@ -1,5 +1,5 @@
import yaml
-from convert import convert
+from convert import convertMeta
from slugify import slugify
from SpipDatabase import *
@@ -8,10 +8,10 @@ class metadata:
def __init__(self, article):
self.id = article.id_article
# self.surtitle = article.surtitre # Probably unused
- self.title = convert(article.titre)
+ self.title = convertMeta(article.titre)
self.subtitle = article.soustitre # Probably unused
# self.section = article.id_rubrique # TODO join
- self.description = convert(article.descriptif)
+ self.description = convertMeta(article.descriptif)
self.caption = article.chapo # Probably unused
self.ps = article.ps # Probably unused
self.publicationDate = article.date
diff --git a/spip2md/convert.py b/spip2md/convert.py
index 4497222..2144bc9 100644
--- a/spip2md/convert.py
+++ b/spip2md/convert.py
@@ -20,10 +20,18 @@ spipToMarkdown = (
re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I),
r"**\1**",
),
+ ( # html strong
+ re.compile(r" *(.*?) *", re.S | re.I),
+ r"**\1**",
+ ),
( # emphasis
re.compile(r"\{ *(.*?) *\}", re.S | re.I),
r"*\1*",
),
+ ( # html emphasis
+ re.compile(r" *(.*?) *<\/i>", re.S | re.I),
+ r"*\1*",
+ ),
( # strikethrough
re.compile(
r"\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
@@ -94,13 +102,58 @@ spipToMarkdown = (
),
( # Keep only the first language in multi-language blocks
re.compile(
- r"\s*\[.{2,4}\]\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
+ r"\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
re.S | re.I,
),
r"\1",
),
)
+# Rules applied in order by convertMeta(): each entry is a (compiled pattern,
+# replacement) pair that removes SPIP or HTML markup and keeps the inner text.
+# Every HTML rule is anchored on its opening tag so untagged text is untouched.
+spipToMetadata = (
+    (  # strong: {{text}} -> text
+        re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I),
+        r"\1",
+    ),
+    (  # html strong: <b>text</b> or <strong>text</strong> -> text
+        re.compile(r"<(?:b|strong)> *(.*?) *<\/(?:b|strong)>", re.S | re.I),
+        r"\1",
+    ),
+    (  # emphasis: {text} -> text
+        re.compile(r"\{ *(.*?) *\}", re.S | re.I),
+        r"\1",
+    ),
+    (  # html emphasis: <i>text</i> -> text
+        re.compile(r"<i> *(.*?) *<\/i>", re.S | re.I),
+        r"\1",
+    ),
+    (  # strikethrough: <del>text, closed by </del> or by a blank line -> text
+        re.compile(r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)", re.S | re.I),
+        r"\1",
+    ),
+    (  # Keep only the first language in multi-language blocks
+        re.compile(
+            r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
+            re.S | re.I,
+        ),
+        r"\1",
+    ),
+    (  # remove every remaining tag, and the spaces right after it
+        re.compile(r"<\/?.*?> *", re.S | re.I),
+        r"",
+    ),
+    (  # text beginning with angle bracket(s): drop them
+        re.compile(r"^>+ +", re.S | re.I),
+        r"",
+    ),
+    (  # text beginning with a number followed by a dot: drop it
+        re.compile(r"^\d+\. +", re.S | re.I),
+        r"",
+    ),
+)
+
isoToUtf = (
# Broken encoding
( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1
@@ -203,3 +256,14 @@ def convert(markup):
for match in iso.finditer(markup):
print(f" UNKNOWN CHARACTER {match.group()}")
return markup
+
+
+def convertMeta(markup):
+    """Strip markup from a metadata string, fix its encoding, return the result."""
+    # Markup rules first, then encoding fixes, each table in its declared order
+    for pattern, replacement in (*spipToMetadata, *isoToUtf):
+        markup = pattern.sub(replacement, markup)
+    # Report every match of the unknownIso patterns; the text is left unchanged
+    for match in (hit for iso in unknownIso for hit in iso.finditer(markup)):
+        print(f" UNKNOWN CHARACTER {match.group()}")
+    return markup