more strict cleaning of metadata
This commit is contained in:
parent
d8b7a1b562
commit
b3119924a8
@ -1,5 +1,5 @@
|
|||||||
import yaml
|
import yaml
|
||||||
from convert import convert
|
from convert import convertMeta
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
from SpipDatabase import *
|
from SpipDatabase import *
|
||||||
|
|
||||||
@ -8,10 +8,10 @@ class metadata:
|
|||||||
def __init__(self, article):
|
def __init__(self, article):
|
||||||
self.id = article.id_article
|
self.id = article.id_article
|
||||||
# self.surtitle = article.surtitre # Probably unused
|
# self.surtitle = article.surtitre # Probably unused
|
||||||
self.title = convert(article.titre)
|
self.title = convertMeta(article.titre)
|
||||||
self.subtitle = article.soustitre # Probably unused
|
self.subtitle = article.soustitre # Probably unused
|
||||||
# self.section = article.id_rubrique # TODO join
|
# self.section = article.id_rubrique # TODO join
|
||||||
self.description = convert(article.descriptif)
|
self.description = convertMeta(article.descriptif)
|
||||||
self.caption = article.chapo # Probably unused
|
self.caption = article.chapo # Probably unused
|
||||||
self.ps = article.ps # Probably unused
|
self.ps = article.ps # Probably unused
|
||||||
self.publicationDate = article.date
|
self.publicationDate = article.date
|
||||||
|
@ -20,10 +20,18 @@ spipToMarkdown = (
|
|||||||
re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I),
|
re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I),
|
||||||
r"**\1**",
|
r"**\1**",
|
||||||
),
|
),
|
||||||
|
( # html strong
|
||||||
|
re.compile(r"<strong> *(.*?) *</strong>", re.S | re.I),
|
||||||
|
r"**\1**",
|
||||||
|
),
|
||||||
( # emphasis
|
( # emphasis
|
||||||
re.compile(r"\{ *(.*?) *\}", re.S | re.I),
|
re.compile(r"\{ *(.*?) *\}", re.S | re.I),
|
||||||
r"*\1*",
|
r"*\1*",
|
||||||
),
|
),
|
||||||
|
( # html emphasis
|
||||||
|
re.compile(r"<i> *(.*?) *<\/i>", re.S | re.I),
|
||||||
|
r"*\1*",
|
||||||
|
),
|
||||||
( # strikethrough
|
( # strikethrough
|
||||||
re.compile(
|
re.compile(
|
||||||
r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
|
r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
|
||||||
@ -94,13 +102,58 @@ spipToMarkdown = (
|
|||||||
),
|
),
|
||||||
( # Keep only the first language in multi-language blocks
|
( # Keep only the first language in multi-language blocks
|
||||||
re.compile(
|
re.compile(
|
||||||
r"<multi>\s*\[.{2,4}\]\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
|
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
|
||||||
re.S | re.I,
|
re.S | re.I,
|
||||||
),
|
),
|
||||||
r"\1",
|
r"\1",
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Substitution table used by convertMeta(): (pattern, replacement) pairs that
# are applied one after the other, in the order listed here. Each rule keeps
# the inner text of a SPIP/HTML construct and drops the markup around it.
spipToMetadata = (
    (  # strong
        re.compile(r"\{\{ *(.*?) *\}\}", re.S | re.I),
        r"\1",
    ),
    (  # html strong
        re.compile(r"<strong> *(.*?) *</strong>", re.S | re.I),
        r"\1",
    ),
    (  # emphasis
        re.compile(r"\{ *(.*?) *\}", re.S | re.I),
        r"\1",
    ),
    (  # html emphasis
        re.compile(r"<i> *(.*?) *<\/i>", re.S | re.I),
        r"\1",
    ),
    (  # strikethrough
        re.compile(
            r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
            re.S | re.I,
        ),
        r"\1",
    ),
    (  # Keep only the first language in multi-language blocks
        re.compile(
            # The group that swallows the other languages must be lazy
            # (`.*?`): with re.S a greedy `.*` runs up to the LAST </multi>
            # of the text, so a string holding two <multi> blocks lost
            # everything after the first language of the first block.
            r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*?)*<\/multi>",
            re.S | re.I,
        ),
        r"\1",
    ),
    (  # remove every tag
        re.compile(r"<\/?.*?> *", re.S | re.I),
        r"",
    ),
    (  # beginning with angle bracket(s)
        re.compile(r"^>+ +", re.S | re.I),
        r"",
    ),
    (  # beginning with a number followed by a dot
        re.compile(r"^\d+\. +", re.S | re.I),
        r"",
    ),
)
|
||||||
|
|
||||||
isoToUtf = (
|
isoToUtf = (
|
||||||
# Broken encoding
|
# Broken encoding
|
||||||
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
|
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
|
||||||
@ -203,3 +256,14 @@ def convert(markup):
|
|||||||
for match in iso.finditer(markup):
|
for match in iso.finditer(markup):
|
||||||
print(f" UNKNOWN CHARACTER {match.group()}")
|
print(f" UNKNOWN CHARACTER {match.group()}")
|
||||||
return markup
|
return markup
|
||||||
|
|
||||||
|
|
||||||
|
def convertMeta(markup):
    """Return `markup` cleaned up for use as a metadata value.

    Every (pattern, replacement) pair of `spipToMetadata`, then of
    `isoToUtf`, is substituted into the text in table order. Whatever the
    `unknownIso` patterns still match afterwards is printed to stdout; that
    last step only reports and does not modify the text.
    """
    # Both tables are iterated as (pattern, replacement) pairs, so they can be
    # walked as one sequence: metadata markup rules first, then `isoToUtf`.
    for pattern, replacement in (*spipToMetadata, *isoToUtf):
        markup = pattern.sub(replacement, markup)
    # Collect what the `unknownIso` probes still find in the final text.
    leftovers = (
        found for probe in unknownIso for found in probe.finditer(markup)
    )
    for found in leftovers:
        print(f" UNKNOWN CHARACTER {found.group()}")
    return markup
|
||||||
|
Loading…
Reference in New Issue
Block a user