diff --git a/spip2md/regexmap.py b/spip2md/regexmap.py index 0a2332f..ef9616c 100644 --- a/spip2md/regexmap.py +++ b/spip2md/regexmap.py @@ -5,17 +5,17 @@ from re import I, S, compile # ((SPIP syntax, Replacement Markdown syntax), …) SPIP_MARKDOWN = ( ( # horizontal rule - compile(r"- ?- ?- ?- ?[\- ]*|
", S | I), + compile(r"\r?\n?\r?\n?- ?- ?- ?- ?[\- ]*\r?\n?\r?\n?|
", I), # r"---", - r"***", + "\n\n***\n\n", ), ( # line break - compile(r"\r?\n_ *(?=\r?\n)|
", S | I), - "\n", + compile(r"\r?\n_ *(?=\r?\n)|
", I), + "\n", # WARNING not the real translation ), ( # heading - compile(r"\{\{\{ *(.*?) *\}\}\}", S | I), - r"## \1", # Translate SPIP headings to h2 + compile(r"\r?\n?\r?\n?\{\{\{ *(.*?) *\}\}\}\r?\n?\r?\n?", S | I), + "\n\n## \\1\n\n", # Translate SPIP headings to h2 ), ( # strong compile(r"\{\{ *(.*?) *\}\} ?", S | I), @@ -40,18 +40,18 @@ SPIP_MARKDOWN = ( ), r"~\1~", ), - ( # images - compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I), - r"![](\2)", # Needs to be further processed to replace ID with filename - ), - ( # documents & embeds - compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I), - r"[](\2)", # Needs to be further processed to replace ID with filename - ), - ( # internal links - compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I), - r"[](\2)", # Needs to be further processed to replace ID with filename - ), + # ( # images # processed by a specific function + # compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I), + # r"![](\2)", + # ), + # ( # documents & embeds # processed by a specific function + # compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I), + # r"[](\2)", + # ), + # ( # internal links # processed by a specific function + # compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I), + # r"[](\2)", + # ), ( # anchor compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I), r"[\1](\2)", @@ -62,7 +62,7 @@ SPIP_MARKDOWN = ( ), ( # footnote compile(r"\[\[ *(.*?) *\]\]", S | I), - r"", + r"", # WARNING remove it ), ( # unordered list compile(r"(\r?\n)-(?!#|-)\*? *", S | I), @@ -107,10 +107,35 @@ SPIP_MARKDOWN = ( ), ) -# Match against documents ID found in links, ID can be inserted with .format() -# Name and path can be further replaced with .format() -DOCUMENT_LINK = r"(!)?\[(.*?)\]\(({})\)" -DOCUMENT_LINK_REPL = r"\1[\2{}]({})" +DOCUMENT_LINK = ( + ( # SPIP style documents & embeds links + compile(r"<()(?:doc|document|emb|embed)([0-9]+)(?:\|(.*?))?>", S | I), + r"[{}]({})", + ), + ( # Markdown style documents & embeds links + compile(r"\[(.*?)\]\((?:doc|document|emb|embed)([0-9]+)(?:\|(.*?))?\)", S | I), + r"[\1{}]({})", + ), + ( # SPIP style images links + compile(r"<()(?:img|image)([0-9]+)(?:\|(.*?))?>", S | I), + r"![{}]({})", + ), + ( # Markdown style images links + compile(r"\[(.*?)\]\((?:img|image)([0-9]+)(?:\|(.*?))?\)", S | I), + r"![\1{}]({})", + ), +) # Name and path can be further replaced with .format() + +ARTICLE_LINK = ( + ( # SPIP style documents & embeds links + compile(r"<()(?:art|article)([0-9]+)(?:\|(.*?))?>", S | I), + r"[{}]({})", + ), + ( # Markdown style internal links + compile(r"\[(.*?)\]\((?:art|article)([0-9]+)(?:\|(.*?))?\)", S | I), + r"[\1{}]({})", + ), +) # Name and path can be further replaced with .format() # Multi language block, to be further processed per lang MULTILANG_BLOCK = compile(r"(.+?)<\/multi>", S | I) @@ -120,12 +145,12 @@ MULTILANGS = compile( # WARNING probably useless text in metadata fields, to be removed BLOAT = ( - compile(r"^>+ +", S | I), # Remove beginning with angle bracket(s) - compile(r"^\d+\. +", S | I), # Remove beginning with a number followed by a dot + compile(r"^>+ +"), # Remove beginning with angle bracket(s) + compile(r"^\d+\. +"), # Remove beginning with a number followed by a dot ) # Matches against every HTML tag -HTMLTAG = compile(r"<\/?.*?>\s*", S | I) +HTMLTAG = compile(r"<\/?.*?>\s*", S) # ((Broken ISO 8859-1 encoding, Proper UTF equivalent encoding), …) diff --git a/spip2md/spipobjects.py b/spip2md/spipobjects.py index baa8980..d07aa11 100644 --- a/spip2md/spipobjects.py +++ b/spip2md/spipobjects.py @@ -2,7 +2,7 @@ import logging from os import makedirs, remove from os.path import basename, splitext -from re import finditer, search, sub +from re import finditer, search from shutil import copyfile from typing import Any, Match, Optional @@ -20,9 +20,9 @@ from spip2md.database import ( SpipRubriques, ) from spip2md.regexmap import ( + ARTICLE_LINK, BLOAT, DOCUMENT_LINK, - DOCUMENT_LINK_REPL, HTMLTAG, ISO_UTF, MULTILANG_BLOCK, @@ -72,14 +72,6 @@ class SpipWritable: return MULTILANG_BLOCK.sub(replace_lang, text) - # Remove remaining HTML tags - @staticmethod - def clean_html(string: str) -> str: - if string is not None and len(string) > 0: - return HTMLTAG.sub("", string) - else: - return "" - # Apply different mappings to a text field, like SPIP to Markdown or encoding def convert(self, text: Optional[str], clean_html: bool = True) -> str: # Return unknown char surrounded by context_length chars @@ -108,7 +100,7 @@ class SpipWritable: text = self.translate(text) # Delete remaining HTML tags in body WARNING if clean_html: - text = self.clean_html(text) + text = HTMLTAG.sub("", text) # Warn about unknown chars for char in UNKNOWN_ISO: lastend: int = 0 @@ -222,26 +214,55 @@ class SpipObject(SpipWritable): descriptif: str extra: str + def convert(self, text: Optional[str], clean_html: bool = True) -> str: + if text is not None and len(text) > 0: + for id_link, path_link in DOCUMENT_LINK: + for match in id_link.finditer(text): + doc: Document = Document.get(Document.id_document == match.group(2)) + if doc is not None: + text = text.replace( + match.group(), path_link.format(doc.titre, doc.filename()) + ) + else: + logging.warn( + f"No document for link {match.group()} in {self.titre}" + ) + text = text.replace( + match.group(), path_link.format("", "NOT FOUND") + ) + for id_link, path_link in ARTICLE_LINK: + for match in id_link.finditer(text): + art: Article = Article.get(Article.id_article == match.group(2)) + if art is not None: + text = text.replace( + match.group(), + path_link.format( + art.titre, f"{art.dir_slug()}/{art.filename()}" + ), + ) + else: + logging.warn( + f"No article for link {match.group()} in {self.titre}" + ) + text = text.replace( + match.group(), path_link.format("", "NOT FOUND") + ) + else: + return "" + return super().convert(text, clean_html) + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Common fields that need conversions + self.texte = self.convert(self.texte) self.extra: str = self.convert(self.extra) self.statut: str = "false" if self.statut == "publie" else "true" self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true" # Define file prefix (needs to be redefined for sections) self.prefix = "index" - # Convert SPIP style internal links for images & other files into Markdown style - def link_documents(self, documents: ModelSelect) -> None: - for d in documents: - self.texte = sub( - DOCUMENT_LINK.format(d.id_document), - DOCUMENT_LINK_REPL.format(d.titre, d.filename()), - self.texte, - ) - - # Output related documents & link them in the text by the way - def documents(self, link_documents: bool = True) -> ModelSelect: + # Get related documents + def documents(self) -> ModelSelect: documents = ( Document.select() .join( @@ -250,23 +271,9 @@ class SpipObject(SpipWritable): ) .where(SpipDocumentsLiens.id_objet == self.object_id) ) - if link_documents: - self.link_documents(documents) return documents - # Convert SPIP style internal links for other articles or sections into Markdown - def link_articles(self) -> None: - for match in finditer(r"\[(.*?)]\((?:art|article)([0-9]+)\)", self.texte): - article = Article.get(Article.id_article == match.group(2)) - if len(match.group(1)) > 0: - title: str = match.group(1) - else: - title: str = article.titre - self.texte = self.texte.replace( - match.group(0), f"[{title}]({article.dir_slug()}/{article.filename()})" - ) - - # Output related articles + # Get related articles def articles(self) -> ModelSelect: return ( Article.select() @@ -322,10 +329,6 @@ class SpipObject(SpipWritable): # Write object to output destination def write(self, parent_dir: str) -> str: - # Link articles - self.link_articles() - # Convert body after linking articles - self.texte = self.convert(self.texte) # Define actual export directory directory: str = parent_dir + self.dir_slug() # Make a directory for this object if there isn’t