more strict cleaning of metadata

This commit is contained in:
Guilhem Fauré 2023-05-11 15:17:44 +02:00
parent d8b7a1b562
commit b3119924a8
2 changed files with 68 additions and 4 deletions

View File

@ -1,5 +1,5 @@
import yaml import yaml
from convert import convert from convert import convertMeta
from slugify import slugify from slugify import slugify
from SpipDatabase import * from SpipDatabase import *
@ -8,10 +8,10 @@ class metadata:
def __init__(self, article): def __init__(self, article):
self.id = article.id_article self.id = article.id_article
# self.surtitle = article.surtitre # Probably unused # self.surtitle = article.surtitre # Probably unused
self.title = convert(article.titre) self.title = convertMeta(article.titre)
self.subtitle = article.soustitre # Probably unused self.subtitle = article.soustitre # Probably unused
# self.section = article.id_rubrique # TODO join # self.section = article.id_rubrique # TODO join
self.description = convert(article.descriptif) self.description = convertMeta(article.descriptif)
self.caption = article.chapo # Probably unused self.caption = article.chapo # Probably unused
self.ps = article.ps # Probably unused self.ps = article.ps # Probably unused
self.publicationDate = article.date self.publicationDate = article.date

View File

@ -20,10 +20,18 @@ spipToMarkdown = (
re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I), re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I),
r"**\1**", r"**\1**",
), ),
( # html strong
re.compile(r"<strong> *(.*?) *</strong>", re.S | re.I),
r"**\1**",
),
( # emphasis ( # emphasis
re.compile(r"\{ *(.*?) *\}", re.S | re.I), re.compile(r"\{ *(.*?) *\}", re.S | re.I),
r"*\1*", r"*\1*",
), ),
( # html emphasis
re.compile(r"<i> *(.*?) *<\/i>", re.S | re.I),
r"*\1*",
),
( # strikethrough ( # strikethrough
re.compile( re.compile(
r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)", r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
@ -94,13 +102,58 @@ spipToMarkdown = (
), ),
( # Keep only the first language in multi-language blocks ( # Keep only the first language in multi-language blocks
re.compile( re.compile(
r"<multi>\s*\[.{2,4}\]\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>", r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
re.S | re.I, re.S | re.I,
), ),
r"\1", r"\1",
), ),
) )
# Ordered (pattern, replacement) pairs that reduce SPIP/HTML markup to plain
# text for metadata fields. They are meant to be applied one after another,
# so the double-brace rule must stay ahead of the single-brace rule.
spipToMetadata = (
    # SPIP strong, {{text}}: keep only the inner text
    (re.compile(r"\{\{ *(.*?) *\}\}", re.DOTALL | re.IGNORECASE), r"\1"),
    # HTML <strong>text</strong>: keep only the inner text
    (
        re.compile(r"<strong> *(.*?) *</strong>", re.DOTALL | re.IGNORECASE),
        r"\1",
    ),
    # SPIP emphasis, {text}: keep only the inner text
    (re.compile(r"\{ *(.*?) *\}", re.DOTALL | re.IGNORECASE), r"\1"),
    # HTML <i>text</i>: keep only the inner text
    (re.compile(r"<i> *(.*?) *<\/i>", re.DOTALL | re.IGNORECASE), r"\1"),
    # Strikethrough: <del> content closed either by </del> or by a run of
    # two or more line breaks; keep only the inner text
    (
        re.compile(
            r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
            re.DOTALL | re.IGNORECASE,
        ),
        r"\1",
    ),
    # <multi> blocks: keep the first language variant (its [xx] marker is
    # optional) and drop every following [xx]-prefixed variant
    (
        re.compile(
            r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
            re.DOTALL | re.IGNORECASE,
        ),
        r"\1",
    ),
    # Any remaining opening or closing tag, plus the spaces right after it
    (re.compile(r"<\/?.*?> *", re.DOTALL | re.IGNORECASE), ""),
    # Leading angle bracket(s) followed by spaces at the start of the string
    (re.compile(r"^>+ +", re.DOTALL | re.IGNORECASE), ""),
    # Leading "<digits>. " prefix at the start of the string
    (re.compile(r"^\d+\. +", re.DOTALL | re.IGNORECASE), ""),
)
isoToUtf = ( isoToUtf = (
# Broken encoding # Broken encoding
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1 ( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
@ -203,3 +256,14 @@ def convert(markup):
for match in iso.finditer(markup): for match in iso.finditer(markup):
print(f" UNKNOWN CHARACTER {match.group()}") print(f" UNKNOWN CHARACTER {match.group()}")
return markup return markup
def convertMeta(markup):
    """Return markup after the metadata clean-up substitutions.

    Applies every (pattern, replacement) pair of spipToMetadata, then every
    pair of isoToUtf, in order. Afterwards each match of an unknownIso
    pattern still present in the result is printed; the text itself is not
    modified by that reporting step.
    """
    # Same sequence as two consecutive loops: markup rules, then encoding rules
    for pattern, replacement in (*spipToMetadata, *isoToUtf):
        markup = pattern.sub(replacement, markup)
    for pattern in unknownIso:
        for found in pattern.finditer(markup):
            print(f" UNKNOWN CHARACTER {found.group()}")
    return markup