From b3119924a89894e05aaf5c2ed9089086cfe30604 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guilhem=20Faur=C3=A9?=
Date: Thu, 11 May 2023 15:17:44 +0200
Subject: [PATCH] more strict cleaning of metadata

---
NOTE(review): everything between the "---" marker above and the first
"diff --git" line is free-form commentary that git am ignores.

This patch was recovered from a copy whose line breaks had been collapsed
and whose HTML-like tokens had been stripped. What was reconstructed:

  * Line structure, indentation and blank lines. They follow the layout
    implied by the fragments and by the hunk line counts; whitespace inside
    context lines (including the f-string passed to print) could not be
    recovered exactly. TODO confirm against the target files, or apply with
    "git apply --ignore-whitespace".
  * The opening <i>, <del> and <multi> literals in the regexes, restored
    from their surviving closing tags.
  * The <b> ... </b> pair in the two "html strong" rules. This one is an
    assumption made by analogy with <i>; the original may have used
    <strong>. TODO confirm against upstream.

The author address of the From: header was lost as well and has to be
re-added before this can be fed to git am.

 spip2md/Metadata.py |  6 ++---
 spip2md/convert.py  | 66 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/spip2md/Metadata.py b/spip2md/Metadata.py
index 7f8fe3e..348b454 100644
--- a/spip2md/Metadata.py
+++ b/spip2md/Metadata.py
@@ -1,5 +1,5 @@
 import yaml
-from convert import convert
+from convert import convertMeta
 from slugify import slugify
 from SpipDatabase import *
 
@@ -8,10 +8,10 @@ class metadata:
     def __init__(self, article):
         self.id = article.id_article
         # self.surtitle = article.surtitre # Probably unused
-        self.title = convert(article.titre)
+        self.title = convertMeta(article.titre)
         self.subtitle = article.soustitre  # Probably unused
         # self.section = article.id_rubrique # TODO join
-        self.description = convert(article.descriptif)
+        self.description = convertMeta(article.descriptif)
         self.caption = article.chapo  # Probably unused
         self.ps = article.ps  # Probably unused
         self.publicationDate = article.date
diff --git a/spip2md/convert.py b/spip2md/convert.py
index 4497222..2144bc9 100644
--- a/spip2md/convert.py
+++ b/spip2md/convert.py
@@ -20,10 +20,18 @@ spipToMarkdown = (
         re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I),
         r"**\1**",
     ),
+    (  # html strong
+        re.compile(r"<b> *(.*?) *</b>", re.S | re.I),
+        r"**\1**",
+    ),
     (  # emphasis
         re.compile(r"\{ *(.*?) *\}", re.S | re.I),
         r"*\1*",
     ),
+    (  # html emphasis
+        re.compile(r"<i> *(.*?) *<\/i>", re.S | re.I),
+        r"*\1*",
+    ),
     (  # strikethrough
         re.compile(
             r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
@@ -94,13 +102,58 @@ spipToMarkdown = (
     ),
     (  # Keep only the first language in multi-language blocks
         re.compile(
-            r"<multi>\s*\[.{2,4}\]\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
+            r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
             re.S | re.I,
         ),
         r"\1",
     ),
 )
 
+spipToMetadata = (
+    (  # strong
+        re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I),
+        r"\1",
+    ),
+    (  # html strong
+        re.compile(r"<b> *(.*?) *</b>", re.S | re.I),
+        r"\1",
+    ),
+    (  # emphasis
+        re.compile(r"\{ *(.*?) *\}", re.S | re.I),
+        r"\1",
+    ),
+    (  # html emphasis
+        re.compile(r"<i> *(.*?) *<\/i>", re.S | re.I),
+        r"\1",
+    ),
+    (  # strikethrough
+        re.compile(
+            r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
+            re.S | re.I,
+        ),
+        r"\1",
+    ),
+    (  # Keep only the first language in multi-language blocks
+        re.compile(
+            r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
+            re.S | re.I,
+        ),
+        r"\1",
+    ),
+    (  # remove every tag
+        re.compile(r"<\/?.*?> *", re.S | re.I),
+        r"",
+    ),
+    (  # beginning with angle bracket(s)
+        re.compile(r"^>+ +", re.S | re.I),
+        r"",
+    ),
+    (  # beginning with a number followed by a dot
+        re.compile(r"^\d+\. +", re.S | re.I),
+        r"",
+    ),
+)
+
 isoToUtf = (
     # Broken encoding
     (  # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1
@@ -203,3 +256,14 @@ def convert(markup):
         for match in iso.finditer(markup):
             print(f" UNKNOWN CHARACTER {match.group()}")
     return markup
+
+
+def convertMeta(markup):
+    for spip, metadata in spipToMetadata:
+        markup = spip.sub(metadata, markup)
+    for iso, utf in isoToUtf:
+        markup = iso.sub(utf, markup)
+    for iso in unknownIso:
+        for match in iso.finditer(markup):
+            print(f" UNKNOWN CHARACTER {match.group()}")
+    return markup