fix document linking

This commit is contained in:
Guilhem Fauré 2023-05-30 12:46:17 +02:00
parent 9c79433f74
commit 27c281db90
2 changed files with 95 additions and 67 deletions

View File

@ -5,17 +5,17 @@ from re import I, S, compile
# ((SPIP syntax, Replacement Markdown syntax), …) # ((SPIP syntax, Replacement Markdown syntax), …)
SPIP_MARKDOWN = ( SPIP_MARKDOWN = (
( # horizontal rule ( # horizontal rule
compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I), compile(r"\r?\n?\r?\n?- ?- ?- ?- ?[\- ]*\r?\n?\r?\n?|<hr ?.*?>", I),
# r"---", # r"---",
r"***", "\n\n***\n\n",
), ),
( # line break ( # line break
compile(r"\r?\n_ *(?=\r?\n)|<br ?.*?>", S | I), compile(r"\r?\n_ *(?=\r?\n)|<br ?.*?>", I),
"\n", "\n", # WARNING not the real translation
), ),
( # heading ( # heading
compile(r"\{\{\{ *(.*?) *\}\}\}", S | I), compile(r"\r?\n?\r?\n?\{\{\{ *(.*?) *\}\}\}\r?\n?\r?\n?", S | I),
r"## \1", # Translate SPIP headings to h2 "\n\n## \\1\n\n", # Translate SPIP headings to h2
), ),
( # strong ( # strong
compile(r"\{\{ *(.*?) *\}\} ?", S | I), compile(r"\{\{ *(.*?) *\}\} ?", S | I),
@ -40,18 +40,18 @@ SPIP_MARKDOWN = (
), ),
r"~\1~", r"~\1~",
), ),
( # images # ( # images # processed by a specific function
compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I), # compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I),
r"![](\2)", # Needs to be further processed to replace ID with filename # r"![](\2)",
), # ),
( # documents & embeds # ( # documents & embeds # processed by a specific function
compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I), # compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
r"[](\2)", # Needs to be further processed to replace ID with filename # r"[](\2)",
), # ),
( # internal links # ( # internal links # processed by a specific function
compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I), # compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I),
r"[](\2)", # Needs to be further processed to replace ID with filename # r"[](\2)",
), # ),
( # anchor ( # anchor
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I), compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
r"[\1](\2)", r"[\1](\2)",
@ -62,7 +62,7 @@ SPIP_MARKDOWN = (
), ),
( # footnote ( # footnote
compile(r"\[\[ *(.*?) *\]\]", S | I), compile(r"\[\[ *(.*?) *\]\]", S | I),
r"", r"", # WARNING remove it
), ),
( # unordered list ( # unordered list
compile(r"(\r?\n)-(?!#|-)\*? *", S | I), compile(r"(\r?\n)-(?!#|-)\*? *", S | I),
@ -107,10 +107,35 @@ SPIP_MARKDOWN = (
), ),
) )
# Match against documents ID found in links, ID can be inserted with .format() DOCUMENT_LINK = (
# Name and path can be further replaced with .format() ( # SPIP style documents & embeds links
DOCUMENT_LINK = r"(!)?\[(.*?)\]\(({})\)" compile(r"<()(?:doc|document|emb|embed)([0-9]+)(?:\|(.*?))?>", S | I),
DOCUMENT_LINK_REPL = r"\1[\2{}]({})" r"[{}]({})",
),
( # Markdown style documents & embeds links
compile(r"\[(.*?)\]\((?:doc|document|emb|embed)([0-9]+)(?:\|(.*?))?\)", S | I),
r"[\1{}]({})",
),
( # SPIP style images links
compile(r"<()(?:img|image)([0-9]+)(?:\|(.*?))?>", S | I),
r"![{}]({})",
),
( # Markdown style images links
compile(r"\[(.*?)\]\((?:img|image)([0-9]+)(?:\|(.*?))?\)", S | I),
r"![\1{}]({})",
),
) # Name and path can be further replaced with .format()
# ((pattern matching an internal article link, Markdown link template), …)
# Pattern groups: 1 = link text (always empty for SPIP style links),
# 2 = numeric article ID, 3 = optional text following a "|" after the ID.
# Template: "\1" stands for group 1, the two "{}" are filled with .format().
ARTICLE_LINK = (
    (  # SPIP style internal article links
        compile(r"<()(?:art|article)([0-9]+)(?:\|(.*?))?>", S | I),
        r"[{}]({})",
    ),
    (  # Markdown style internal links
        # The link text excludes "]" so that a match cannot start at an
        # earlier, unrelated "[" and swallow everything up to the real link
        compile(r"\[([^\]]*)\]\((?:art|article)([0-9]+)(?:\|(.*?))?\)", S | I),
        r"[\1{}]({})",
    ),
)  # Name and path can be further replaced with .format()
# Multi language block, to be further processed per lang # Multi language block, to be further processed per lang
MULTILANG_BLOCK = compile(r"<multi>(.+?)<\/multi>", S | I) MULTILANG_BLOCK = compile(r"<multi>(.+?)<\/multi>", S | I)
@ -120,12 +145,12 @@ MULTILANGS = compile(
# WARNING probably useless text in metadata fields, to be removed # WARNING probably useless text in metadata fields, to be removed
BLOAT = ( BLOAT = (
compile(r"^>+ +", S | I), # Remove beginning with angle bracket(s) compile(r"^>+ +"), # Remove beginning with angle bracket(s)
compile(r"^\d+\. +", S | I), # Remove beginning with a number followed by a dot compile(r"^\d+\. +"), # Remove beginning with a number followed by a dot
) )
# Matches against every HTML tag # Matches against every HTML tag
HTMLTAG = compile(r"<\/?.*?>\s*", S | I) HTMLTAG = compile(r"<\/?.*?>\s*", S)
# ((Broken ISO 8859-1 encoding, Proper UTF equivalent encoding), …) # ((Broken ISO 8859-1 encoding, Proper UTF equivalent encoding), …)

View File

@ -2,7 +2,7 @@
import logging import logging
from os import makedirs, remove from os import makedirs, remove
from os.path import basename, splitext from os.path import basename, splitext
from re import finditer, search, sub from re import finditer, search
from shutil import copyfile from shutil import copyfile
from typing import Any, Match, Optional from typing import Any, Match, Optional
@ -20,9 +20,9 @@ from spip2md.database import (
SpipRubriques, SpipRubriques,
) )
from spip2md.regexmap import ( from spip2md.regexmap import (
ARTICLE_LINK,
BLOAT, BLOAT,
DOCUMENT_LINK, DOCUMENT_LINK,
DOCUMENT_LINK_REPL,
HTMLTAG, HTMLTAG,
ISO_UTF, ISO_UTF,
MULTILANG_BLOCK, MULTILANG_BLOCK,
@ -72,14 +72,6 @@ class SpipWritable:
return MULTILANG_BLOCK.sub(replace_lang, text) return MULTILANG_BLOCK.sub(replace_lang, text)
# Strip whatever HTML tags are left in a string
@staticmethod
def clean_html(string: str) -> str:
    """Return *string* with every HTMLTAG match removed.

    None and the empty string both yield "".
    """
    # Guard clause: nothing to clean
    if string is None or len(string) == 0:
        return ""
    return HTMLTAG.sub("", string)
# Apply different mappings to a text field, like SPIP to Markdown or encoding # Apply different mappings to a text field, like SPIP to Markdown or encoding
def convert(self, text: Optional[str], clean_html: bool = True) -> str: def convert(self, text: Optional[str], clean_html: bool = True) -> str:
# Return unknown char surrounded by context_length chars # Return unknown char surrounded by context_length chars
@ -108,7 +100,7 @@ class SpipWritable:
text = self.translate(text) text = self.translate(text)
# Delete remaining HTML tags in body WARNING # Delete remaining HTML tags in body WARNING
if clean_html: if clean_html:
text = self.clean_html(text) text = HTMLTAG.sub("", text)
# Warn about unknown chars # Warn about unknown chars
for char in UNKNOWN_ISO: for char in UNKNOWN_ISO:
lastend: int = 0 lastend: int = 0
@ -222,26 +214,55 @@ class SpipObject(SpipWritable):
descriptif: str descriptif: str
extra: str extra: str
def convert(self, text: Optional[str], clean_html: bool = True) -> str:
    """Resolve internal document and article links, then run the base conversion.

    Every link matched by a DOCUMENT_LINK or ARTICLE_LINK pattern has its
    numeric ID (capture group 2) looked up. The link is rewritten from the
    paired template with the record's ``titre`` and a path built from
    ``filename()`` (prefixed with ``dir_slug()`` for articles). When no
    record is found a warning is logged and the link is rewritten with an
    empty title and the path "NOT FOUND".

    Args:
        text: Field content to convert; None or "" yields "".
        clean_html: Forwarded unchanged to the parent ``convert``.

    Returns:
        The parent ``convert`` result for the relinked text.
    """
    if not text:
        return ""

    def document_path(doc: Any) -> str:
        return doc.filename()

    def article_path(art: Any) -> str:
        return f"{art.dir_slug()}/{art.filename()}"

    # (link table, model, ID field, word used in the warning, path builder)
    lookups = (
        (DOCUMENT_LINK, Document, Document.id_document, "document", document_path),
        (ARTICLE_LINK, Article, Article.id_article, "article", article_path),
    )
    for link_table, model, id_field, kind, build_path in lookups:
        for id_link, path_link in link_table:
            # str.replace() does not expand regex backreferences, so the "\1"
            # of the template is turned into a named field filled from group 1
            template = path_link.replace(r"\1", "{label}")
            # finditer() scans the text as it is when this loop starts;
            # rebinding `text` below does not disturb the iteration
            for match in id_link.finditer(text):
                # get() raises DoesNotExist on a miss instead of returning
                # None (assumes Document / Article are peewee models), which
                # would make the "NOT FOUND" fallback unreachable
                found = model.get_or_none(id_field == match.group(2))
                if found is not None:
                    name, path = found.titre, build_path(found)
                else:
                    logging.warning(
                        f"No {kind} for link {match.group()} in {self.titre}"
                    )
                    name, path = "", "NOT FOUND"
                text = text.replace(
                    match.group(),
                    template.format(name, path, label=match.group(1)),
                )
    return super().convert(text, clean_html)
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
# Common fields that need conversions # Common fields that need conversions
self.texte = self.convert(self.texte)
self.extra: str = self.convert(self.extra) self.extra: str = self.convert(self.extra)
self.statut: str = "false" if self.statut == "publie" else "true" self.statut: str = "false" if self.statut == "publie" else "true"
self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true" self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
# Define file prefix (needs to be redefined for sections) # Define file prefix (needs to be redefined for sections)
self.prefix = "index" self.prefix = "index"
# Convert SPIP style internal links for images & other files into Markdown style # Get related documents
def link_documents(self, documents: ModelSelect) -> None: def documents(self) -> ModelSelect:
for d in documents:
self.texte = sub(
DOCUMENT_LINK.format(d.id_document),
DOCUMENT_LINK_REPL.format(d.titre, d.filename()),
self.texte,
)
# Output related documents & link them in the text by the way
def documents(self, link_documents: bool = True) -> ModelSelect:
documents = ( documents = (
Document.select() Document.select()
.join( .join(
@ -250,23 +271,9 @@ class SpipObject(SpipWritable):
) )
.where(SpipDocumentsLiens.id_objet == self.object_id) .where(SpipDocumentsLiens.id_objet == self.object_id)
) )
if link_documents:
self.link_documents(documents)
return documents return documents
# Convert SPIP style internal links for other articles or sections into Markdown # Get related articles
def link_articles(self) -> None:
    """Rewrite ``[text](artN)`` / ``[text](articleN)`` links in ``self.texte``.

    Each link target is replaced by ``dir_slug()/filename()`` of the article
    whose ID is N. The link text is kept when present, otherwise the
    article's ``titre`` is used. Links whose article cannot be found are
    logged and left untouched.
    """
    # The link text excludes "]" so a match cannot start at an earlier,
    # unrelated "[" on the same line
    for match in finditer(r"\[([^\]]*)]\((?:art|article)([0-9]+)\)", self.texte):
        # get() raises DoesNotExist on an unknown ID (assumes Article is a
        # peewee model); get_or_none() lets a dangling link be skipped
        article = Article.get_or_none(Article.id_article == match.group(2))
        if article is None:
            logging.warning(f"No article for link {match.group(0)}")
            continue
        # Explicit link text wins over the article title
        title: str = match.group(1) or article.titre
        self.texte = self.texte.replace(
            match.group(0), f"[{title}]({article.dir_slug()}/{article.filename()})"
        )
# Output related articles
def articles(self) -> ModelSelect: def articles(self) -> ModelSelect:
return ( return (
Article.select() Article.select()
@ -322,10 +329,6 @@ class SpipObject(SpipWritable):
# Write object to output destination # Write object to output destination
def write(self, parent_dir: str) -> str: def write(self, parent_dir: str) -> str:
# Link articles
self.link_articles()
# Convert body after linking articles
self.texte = self.convert(self.texte)
# Define actual export directory # Define actual export directory
directory: str = parent_dir + self.dir_slug() directory: str = parent_dir + self.dir_slug()
# Make a directory for this object if there isnt # Make a directory for this object if there isnt