From b3119924a89894e05aaf5c2ed9089086cfe30604 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guilhem=20Faur=C3=A9?= <pro@gfaure.eu>
Date: Thu, 11 May 2023 15:17:44 +0200
Subject: [PATCH] more strict cleaning of metadata

---
 spip2md/Metadata.py |  6 ++---
 spip2md/convert.py  | 66 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 68 insertions(+), 4 deletions(-)
diff --git a/spip2md/Metadata.py b/spip2md/Metadata.py
index 7f8fe3e..348b454 100644
--- a/spip2md/Metadata.py
+++ b/spip2md/Metadata.py
@@ -1,5 +1,5 @@
 import yaml
-from convert import convert
+from convert import convertMeta
 from slugify import slugify
 from SpipDatabase import *
 
@@ -8,10 +8,10 @@ class metadata:
     def __init__(self, article):
         self.id = article.id_article
         # self.surtitle = article.surtitre  # Probably unused
-        self.title = convert(article.titre)
+        self.title = convertMeta(article.titre)
         self.subtitle = article.soustitre  # Probably unused
         # self.section = article.id_rubrique # TODO join
-        self.description = convert(article.descriptif)
+        self.description = convertMeta(article.descriptif)
         self.caption = article.chapo  # Probably unused
         self.ps = article.ps  # Probably unused
         self.publicationDate = article.date
diff --git a/spip2md/convert.py b/spip2md/convert.py
index 4497222..2144bc9 100644
--- a/spip2md/convert.py
+++ b/spip2md/convert.py
@@ -20,10 +20,18 @@ spipToMarkdown = (
         re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I),
         r"**\1**",
     ),
+    (  # html strong
+        re.compile(r"<strong> *(.*?) *</strong>", re.S | re.I),
+        r"**\1**",
+    ),
     (  # emphasis
         re.compile(r"\{ *(.*?) *\}", re.S | re.I),
         r"*\1*",
     ),
+    (  # html emphasis
+        re.compile(r"<i> *(.*?) *<\/i>", re.S | re.I),
+        r"*\1*",
+    ),
     (  # strikethrough
         re.compile(
             r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
@@ -94,13 +102,58 @@ spipToMarkdown = (
     ),
     (  # Keep only the first language in multi-language blocks
         re.compile(
-            r"<multi>\s*\[.{2,4}\]\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
+            r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
             re.S | re.I,
         ),
         r"\1",
     ),
 )
 
+spipToMetadata = (  # (compiled regex, replacement) pairs, applied in order by convertMeta
+    (  # SPIP strong {{text}}: keep only the inner text
+        re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I),
+        r"\1",
+    ),
+    (  # HTML <strong>text</strong>: keep only the inner text
+        re.compile(r"<strong> *(.*?) *</strong>", re.S | re.I),
+        r"\1",
+    ),
+    (  # SPIP emphasis {text}: keep only the inner text (listed after the {{...}} rule)
+        re.compile(r"\{ *(.*?) *\}", re.S | re.I),
+        r"\1",
+    ),
+    (  # HTML <i>text</i>: keep only the inner text
+        re.compile(r"<i> *(.*?) *<\/i>", re.S | re.I),
+        r"\1",
+    ),
+    (  # strikethrough <del>: keep the text, up to </del> or a run of 2+ line breaks
+        re.compile(
+            r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
+            re.S | re.I,
+        ),
+        r"\1",
+    ),
+    (  # Keep only the first language in multi-language blocks; the leading [..] marker is optional
+        re.compile(
+            r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
+            re.S | re.I,
+        ),
+        r"\1",
+    ),
+    (  # Remove any remaining <...> or </...> tag, plus the spaces right after it
+        re.compile(r"<\/?.*?> *", re.S | re.I),
+        r"",
+    ),
+    (  # Strip leading ">" character(s) followed by spaces; no re.M, so only at the start of the string
+        re.compile(r"^>+ +", re.S | re.I),
+        r"",
+    ),
+    (  # Strip a leading "<digits>. " prefix; no re.M, so only at the start of the string
+        re.compile(r"^\d+\. +", re.S | re.I),
+        r"",
+    ),
+)
+
 isoToUtf = (
     # Broken encoding
     (  # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1
@@ -203,3 +256,14 @@ def convert(markup):
         for match in iso.finditer(markup):
             print(f"    UNKNOWN CHARACTER {match.group()}")
     return markup
+
+
+def convertMeta(markup):
+    # Strip markup with the metadata rules first, then apply the isoToUtf fixes
+    for pattern, replacement in (*spipToMetadata, *isoToUtf):
+        markup = pattern.sub(replacement, markup)
+    # Report (without altering) anything still matched by the unknownIso patterns
+    for leftover in unknownIso:
+        for found in leftover.finditer(markup):
+            print(f"    UNKNOWN CHARACTER {found.group()}")
+    return markup