more strict cleaning of metadata
This commit is contained in:
parent
d8b7a1b562
commit
b3119924a8
@ -1,5 +1,5 @@
|
||||
import yaml
|
||||
from convert import convert
|
||||
from convert import convertMeta
|
||||
from slugify import slugify
|
||||
from SpipDatabase import *
|
||||
|
||||
@ -8,10 +8,10 @@ class metadata:
|
||||
def __init__(self, article):
|
||||
self.id = article.id_article
|
||||
# self.surtitle = article.surtitre # Probably unused
|
||||
self.title = convert(article.titre)
|
||||
self.title = convertMeta(article.titre)
|
||||
self.subtitle = article.soustitre # Probably unused
|
||||
# self.section = article.id_rubrique # TODO join
|
||||
self.description = convert(article.descriptif)
|
||||
self.description = convertMeta(article.descriptif)
|
||||
self.caption = article.chapo # Probably unused
|
||||
self.ps = article.ps # Probably unused
|
||||
self.publicationDate = article.date
|
||||
|
@ -20,10 +20,18 @@ spipToMarkdown = (
|
||||
re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I),
|
||||
r"**\1**",
|
||||
),
|
||||
( # html strong
|
||||
re.compile(r"<strong> *(.*?) *</strong>", re.S | re.I),
|
||||
r"**\1**",
|
||||
),
|
||||
( # emphasis
|
||||
re.compile(r"\{ *(.*?) *\}", re.S | re.I),
|
||||
r"*\1*",
|
||||
),
|
||||
( # html emphasis
|
||||
re.compile(r"<i> *(.*?) *<\/i>", re.S | re.I),
|
||||
r"*\1*",
|
||||
),
|
||||
( # strikethrough
|
||||
re.compile(
|
||||
r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
|
||||
@ -94,13 +102,58 @@ spipToMarkdown = (
|
||||
),
|
||||
( # Keep only the first language in multi-language blocks
|
||||
re.compile(
|
||||
r"<multi>\s*\[.{2,4}\]\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
|
||||
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
|
||||
re.S | re.I,
|
||||
),
|
||||
r"\1",
|
||||
),
|
||||
)
|
||||
|
||||
# Ordered (compiled pattern, replacement) pairs used to reduce SPIP/HTML
# markup to plain text. Every pattern is compiled with re.S | re.I.
# The order of the rows is significant: `{{...}}` is handled before `{...}`,
# and the specific tags are unwrapped before the generic tag removal.
spipToMetadata = tuple(
    (re.compile(pattern, re.S | re.I), replacement)
    for pattern, replacement in (
        # strong: keep only the wrapped text
        (r"\{\{ *(.*?) *\}\}", r"\1"),
        # html strong: keep only the wrapped text
        (r"<strong> *(.*?) *</strong>", r"\1"),
        # emphasis: keep only the wrapped text
        (r"\{ *(.*?) *\}", r"\1"),
        # html emphasis: keep only the wrapped text
        (r"<i> *(.*?) *<\/i>", r"\1"),
        # strikethrough: ends at </del> or at two or more line breaks
        (r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)", r"\1"),
        # Keep only the first language in multi-language blocks
        (
            r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
            r"\1",
        ),
        # remove every tag (and the spaces that follow it)
        (r"<\/?.*?> *", r""),
        # beginning with angle bracket(s)
        (r"^>+ +", r""),
        # beginning with a number followed by a dot
        (r"^\d+\. +", r""),
    )
)
|
||||
|
||||
isoToUtf = (
|
||||
# Broken encoding
|
||||
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
|
||||
@ -203,3 +256,14 @@ def convert(markup):
|
||||
for match in iso.finditer(markup):
|
||||
print(f" UNKNOWN CHARACTER {match.group()}")
|
||||
return markup
|
||||
|
||||
|
||||
def convertMeta(markup):
    """Return *markup* after applying the metadata substitution tables.

    Every (pattern, replacement) pair of ``spipToMetadata`` is applied in
    order, followed by every pair of ``isoToUtf``. Afterwards each match of
    a pattern from ``unknownIso`` in the result is reported on standard
    output; those matches are only printed, not replaced.
    """
    # Both tables hold (compiled pattern, replacement) pairs; run them in
    # sequence, feeding the output of each substitution into the next one.
    for table in (spipToMetadata, isoToUtf):
        for pattern, replacement in table:
            markup = pattern.sub(replacement, markup)
    # Report whatever is still matched by the unknownIso patterns.
    for pattern in unknownIso:
        for found in pattern.finditer(markup):
            print(f" UNKNOWN CHARACTER {found.group()}")
    return markup
|
||||
|
Loading…
Reference in New Issue
Block a user