more strict cleaning of metadata

2023-05-11 15:17:44 +02:00 · 2023-05-11 15:17:44 +02:00 · b3119924a8
commit b3119924a8
parent d8b7a1b562
2 changed files with 68 additions and 4 deletions
--- a/spip2md/Metadata.py
+++ b/spip2md/Metadata.py
@ -1,5 +1,5 @@
 import yaml
-from convert import convert
+from convert import convertMeta
 from slugify import slugify
 from SpipDatabase import *
@ -8,10 +8,10 @@ class metadata:
    def __init__(self, article):
        self.id = article.id_article
        # self.surtitle = article.surtitre  # Probably unused
-        self.title = convert(article.titre)
+        self.title = convertMeta(article.titre)
        self.subtitle = article.soustitre  # Probably unused
        # self.section = article.id_rubrique # TODO join
-        self.description = convert(article.descriptif)
+        self.description = convertMeta(article.descriptif)
        self.caption = article.chapo  # Probably unused
        self.ps = article.ps  # Probably unused
        self.publicationDate = article.date
--- a/spip2md/convert.py
+++ b/spip2md/convert.py
@ -20,10 +20,18 @@ spipToMarkdown = (
        re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I),
        r"**\1**",
    ),
    (  # html strong
        re.compile(r"<strong> *(.*?) *</strong>", re.S | re.I),
        r"**\1**",
    ),
    (  # emphasis
        re.compile(r"\{ *(.*?) *\}", re.S | re.I),
        r"*\1*",
    ),
    (  # html emphasis
        re.compile(r"<i> *(.*?) *<\/i>", re.S | re.I),
        r"*\1*",
    ),
    (  # strikethrough
        re.compile(
            r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
@ -94,13 +102,58 @@ spipToMarkdown = (
    ),
    (  # Keep only the first language in multi-language blocks
        re.compile(
-            r"<multi>\s*\[.{2,4}\]\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
+            r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
            re.S | re.I,
        ),
        r"\1",
    ),
 )
 spipToMetadata = (
    # Ordered (pattern, replacement) pairs reducing SPIP/HTML markup to plain
    # text for metadata fields. convertMeta applies them one after another, so
    # the order is significant (e.g. "{{...}}" is handled before "{...}").
    #
    # SPIP strong "{{text}}": keep only the inner text
    (re.compile(r"\{\{ *(.*?) *\}\}", flags=re.I | re.S), r"\1"),
    # HTML <strong>: keep only the inner text
    (re.compile(r"<strong> *(.*?) *</strong>", flags=re.I | re.S), r"\1"),
    # SPIP emphasis "{text}": keep only the inner text
    (re.compile(r"\{ *(.*?) *\}", flags=re.I | re.S), r"\1"),
    # HTML <i>: keep only the inner text
    (re.compile(r"<i> *(.*?) *<\/i>", flags=re.I | re.S), r"\1"),
    # Strikethrough <del>, closed by </del> or by a blank line: keep the text
    (
        re.compile(
            r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
            flags=re.I | re.S,
        ),
        r"\1",
    ),
    # <multi> block: keep only the first language, whose [xx] tag is optional
    (
        re.compile(
            r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
            flags=re.I | re.S,
        ),
        r"\1",
    ),
    # Any remaining tag, together with the spaces that follow it, is dropped
    (re.compile(r"<\/?.*?> *", flags=re.I | re.S), r""),
    # Leading ">" characters followed by spaces (no re.M: string start only)
    (re.compile(r"^>+ +", flags=re.I | re.S), r""),
    # Leading "<digits>. " prefix (no re.M: string start only)
    (re.compile(r"^\d+\. +", flags=re.I | re.S), r""),
 )
 isoToUtf = (
    # Broken encoding
    (  # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
@ -203,3 +256,14 @@ def convert(markup):
        for match in iso.finditer(markup):
            print(f"    UNKNOWN CHARACTER {match.group()}")
    return markup
 def convertMeta(markup):
    """Return *markup* cleaned for use as a metadata value.

    Runs every ``spipToMetadata`` substitution in order, then every
    ``isoToUtf`` substitution, and finally prints one warning line for each
    match of an ``unknownIso`` pattern still present in the result.
    """
    cleaned = markup
    # Strip SPIP/HTML markup first, then apply the encoding replacements
    for pattern, replacement in spipToMetadata:
        cleaned = pattern.sub(replacement, cleaned)
    for broken, fixed in isoToUtf:
        cleaned = broken.sub(fixed, cleaned)
    # Report whatever the replacements above did not cover
    leftovers = (hit for unknown in unknownIso for hit in unknown.finditer(cleaned))
    for hit in leftovers:
        print(f"    UNKNOWN CHARACTER {hit.group()}")
    return cleaned