fix document linking

This commit is contained in:
Guilhem Fauré 2023-05-30 12:46:17 +02:00
parent 9c79433f74
commit 27c281db90
2 changed files with 95 additions and 67 deletions

View File

@ -5,17 +5,17 @@ from re import I, S, compile
# ((SPIP syntax, Replacement Markdown syntax), …)
SPIP_MARKDOWN = (
( # horizontal rule
compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I),
compile(r"\r?\n?\r?\n?- ?- ?- ?- ?[\- ]*\r?\n?\r?\n?|<hr ?.*?>", I),
# r"---",
r"***",
"\n\n***\n\n",
),
( # line break
compile(r"\r?\n_ *(?=\r?\n)|<br ?.*?>", S | I),
"\n",
compile(r"\r?\n_ *(?=\r?\n)|<br ?.*?>", I),
"\n", # WARNING not the real translation
),
( # heading
compile(r"\{\{\{ *(.*?) *\}\}\}", S | I),
r"## \1", # Translate SPIP headings to h2
compile(r"\r?\n?\r?\n?\{\{\{ *(.*?) *\}\}\}\r?\n?\r?\n?", S | I),
"\n\n## \\1\n\n", # Translate SPIP headings to h2
),
( # strong
compile(r"\{\{ *(.*?) *\}\} ?", S | I),
@ -40,18 +40,18 @@ SPIP_MARKDOWN = (
),
r"~\1~",
),
( # images
compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I),
r"![](\2)", # Needs to be further processed to replace ID with filename
),
( # documents & embeds
compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
r"[](\2)", # Needs to be further processed to replace ID with filename
),
( # internal links
compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I),
r"[](\2)", # Needs to be further processed to replace ID with filename
),
# ( # images # processed by a specific function
# compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I),
# r"![](\2)",
# ),
# ( # documents & embeds # processed by a specific function
# compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
# r"[](\2)",
# ),
# ( # internal links # processed by a specific function
# compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I),
# r"[](\2)",
# ),
( # anchor
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
r"[\1](\2)",
@ -62,7 +62,7 @@ SPIP_MARKDOWN = (
),
( # footnote
compile(r"\[\[ *(.*?) *\]\]", S | I),
r"",
r"", # WARNING remove it
),
( # unordered list
compile(r"(\r?\n)-(?!#|-)\*? *", S | I),
@ -107,10 +107,35 @@ SPIP_MARKDOWN = (
),
)
# Match against documents ID found in links, ID can be inserted with .format()
# Name and path can be further replaced with .format()
DOCUMENT_LINK = r"(!)?\[(.*?)\]\(({})\)"
DOCUMENT_LINK_REPL = r"\1[\2{}]({})"
# ((Compiled pattern matching a link to a document, Markdown link template), …)
# In every pattern, group 1 is the link text already present (an always-empty
# group for SPIP style tags), group 2 is the numeric ID and group 3 is the
# optional text that follows a "|"
DOCUMENT_LINK = (
( # SPIP style documents & embeds links
compile(r"<()(?:doc|document|emb|embed)([0-9]+)(?:\|(.*?))?>", S | I),
r"[{}]({})",
),
( # Markdown style documents & embeds links
compile(r"\[(.*?)\]\((?:doc|document|emb|embed)([0-9]+)(?:\|(.*?))?\)", S | I),
r"[\1{}]({})",
),
( # SPIP style images links
compile(r"<()(?:img|image)([0-9]+)(?:\|(.*?))?>", S | I),
r"![{}]({})",
),
( # Markdown style images links
compile(r"\[(.*?)\]\((?:img|image)([0-9]+)(?:\|(.*?))?\)", S | I),
r"![\1{}]({})",
),
) # Name and path can be further replaced with .format()
# ((Compiled pattern matching a link to an article, Markdown link template), …)
# In every pattern, group 1 is the link text already present (an always-empty
# group for SPIP style tags), group 2 is the numeric ID and group 3 is the
# optional text that follows a "|"
ARTICLE_LINK = (
( # SPIP style internal links
compile(r"<()(?:art|article)([0-9]+)(?:\|(.*?))?>", S | I),
r"[{}]({})",
),
( # Markdown style internal links
compile(r"\[(.*?)\]\((?:art|article)([0-9]+)(?:\|(.*?))?\)", S | I),
r"[\1{}]({})",
),
) # Name and path can be further replaced with .format()
# Multi language block, to be further processed per lang
MULTILANG_BLOCK = compile(r"<multi>(.+?)<\/multi>", S | I)
@ -120,12 +145,12 @@ MULTILANGS = compile(
# WARNING probably useless text in metadata fields, to be removed
BLOAT = (
compile(r"^>+ +", S | I), # Remove beginning with angle bracket(s)
compile(r"^\d+\. +", S | I), # Remove beginning with a number followed by a dot
compile(r"^>+ +"), # Remove beginning with angle bracket(s)
compile(r"^\d+\. +"), # Remove beginning with a number followed by a dot
)
# Matches against every HTML tag
HTMLTAG = compile(r"<\/?.*?>\s*", S | I)
HTMLTAG = compile(r"<\/?.*?>\s*", S)
# ((Broken ISO 8859-1 encoding, Proper UTF equivalent encoding), …)

View File

@ -2,7 +2,7 @@
import logging
from os import makedirs, remove
from os.path import basename, splitext
from re import finditer, search, sub
from re import finditer, search
from shutil import copyfile
from typing import Any, Match, Optional
@ -20,9 +20,9 @@ from spip2md.database import (
SpipRubriques,
)
from spip2md.regexmap import (
ARTICLE_LINK,
BLOAT,
DOCUMENT_LINK,
DOCUMENT_LINK_REPL,
HTMLTAG,
ISO_UTF,
MULTILANG_BLOCK,
@ -72,14 +72,6 @@ class SpipWritable:
return MULTILANG_BLOCK.sub(replace_lang, text)
# Remove remaining HTML tags
@staticmethod
def clean_html(string: str) -> str:
if string is not None and len(string) > 0:
return HTMLTAG.sub("", string)
else:
return ""
# Apply different mappings to a text field, like SPIP to Markdown or encoding
def convert(self, text: Optional[str], clean_html: bool = True) -> str:
# Return unknown char surrounded by context_length chars
@ -108,7 +100,7 @@ class SpipWritable:
text = self.translate(text)
# Delete remaining HTML tags in body WARNING
if clean_html:
text = self.clean_html(text)
text = HTMLTAG.sub("", text)
# Warn about unknown chars
for char in UNKNOWN_ISO:
lastend: int = 0
@ -222,26 +214,55 @@ class SpipObject(SpipWritable):
descriptif: str
extra: str
def convert(self, text: Optional[str], clean_html: bool = True) -> str:
    """Resolve document and article links in ``text``, then convert it.

    Every link matched by a DOCUMENT_LINK or ARTICLE_LINK pattern has its
    numeric ID (group 2) looked up. The link is rewritten from the paired
    template with the object's title and path, or with an empty title and
    "NOT FOUND" when no object has that ID. The result is then handed to the
    parent class ``convert``.

    Args:
        text: Raw field content; ``None`` or an empty string yields ``""``.
        clean_html: Forwarded unchanged to the parent ``convert``.

    Returns:
        The converted text, or ``""`` when there was nothing to convert.
    """
    if not text:
        return ""

    def render(match: Match, template: str, title: Any, path: Any) -> str:
        # Fill the {} slots first, doubling backslashes in the inserted values
        # so that expand() only resolves the template's own \1 backreference.
        # A plain str.replace() would leave a literal "\1" in the output.
        filled = template.format(
            str(title).replace("\\", "\\\\"), str(path).replace("\\", "\\\\")
        )
        return match.expand(filled)

    for id_link, path_link in DOCUMENT_LINK:
        # finditer() scans the string as it was when the loop started;
        # rebinding ``text`` below does not disturb the iteration.
        for match in id_link.finditer(text):
            # get_or_none() returns None for a missing row, whereas get()
            # raises DoesNotExist and would never reach the fallback branch.
            doc: Optional[Document] = Document.get_or_none(
                Document.id_document == match.group(2)
            )
            if doc is not None:
                replacement = render(match, path_link, doc.titre, doc.filename())
            else:
                logging.warning(
                    f"No document for link {match.group()} in {self.titre}"
                )
                replacement = render(match, path_link, "", "NOT FOUND")
            text = text.replace(match.group(), replacement)

    for id_link, path_link in ARTICLE_LINK:
        for match in id_link.finditer(text):
            art: Optional[Article] = Article.get_or_none(
                Article.id_article == match.group(2)
            )
            if art is not None:
                replacement = render(
                    match,
                    path_link,
                    art.titre,
                    f"{art.dir_slug()}/{art.filename()}",
                )
            else:
                logging.warning(
                    f"No article for link {match.group()} in {self.titre}"
                )
                replacement = render(match, path_link, "", "NOT FOUND")
            text = text.replace(match.group(), replacement)

    return super().convert(text, clean_html)
def __init__(self, *args, **kwargs):
    """Initialise the parent class, then normalise the shared fields."""
    super().__init__(*args, **kwargs)
    # Body and extra fields both go through the conversion pipeline
    self.texte = self.convert(self.texte)
    self.extra: str = self.convert(self.extra)
    # "publie" maps to "false", any other status to "true"
    if self.statut == "publie":
        self.statut: str = "false"
    else:
        self.statut: str = "true"
    # "oui" maps to "false", any other value to "true"
    if self.langue_choisie == "oui":
        self.langue_choisie: str = "false"
    else:
        self.langue_choisie: str = "true"
    # Default file prefix; sections need to redefine it
    self.prefix = "index"
# Convert SPIP style internal links for images & other files into Markdown style
# For each given document, every match of DOCUMENT_LINK (formatted with the
# document ID) in self.texte is substituted by DOCUMENT_LINK_REPL (formatted
# with the document title and file name)
def link_documents(self, documents: ModelSelect) -> None:
for d in documents:
self.texte = sub(
DOCUMENT_LINK.format(d.id_document),
DOCUMENT_LINK_REPL.format(d.titre, d.filename()),
self.texte,
)
# Output related documents & link them in the text by the way
def documents(self, link_documents: bool = True) -> ModelSelect:
# Get related documents
def documents(self) -> ModelSelect:
documents = (
Document.select()
.join(
@ -250,23 +271,9 @@ class SpipObject(SpipWritable):
)
.where(SpipDocumentsLiens.id_objet == self.object_id)
)
if link_documents:
self.link_documents(documents)
return documents
# Convert SPIP style internal links for other articles or sections into Markdown
# Each [label](artN) or [label](articleN) link found in self.texte is rewritten
# to point at "<dir_slug>/<filename>" of the article whose ID is N
def link_articles(self) -> None:
for match in finditer(r"\[(.*?)]\((?:art|article)([0-9]+)\)", self.texte):
article = Article.get(Article.id_article == match.group(2))
if len(match.group(1)) > 0:
# Keep the label already written in the link
title: str = match.group(1)
else:
# Empty label: fall back to the title of the target article
title: str = article.titre
self.texte = self.texte.replace(
match.group(0), f"[{title}]({article.dir_slug()}/{article.filename()})"
)
# Output related articles
# Get related articles
def articles(self) -> ModelSelect:
return (
Article.select()
@ -322,10 +329,6 @@ class SpipObject(SpipWritable):
# Write object to output destination
def write(self, parent_dir: str) -> str:
# Link articles
self.link_articles()
# Convert body after linking articles
self.texte = self.convert(self.texte)
# Define actual export directory
directory: str = parent_dir + self.dir_slug()
# Make a directory for this object if there isn't one already