diff --git a/spip2md/converter.py b/spip2md/converter.py index 83ed6a8..98af255 100644 --- a/spip2md/converter.py +++ b/spip2md/converter.py @@ -1,5 +1,5 @@ # pyright: strict -from re import I, S, compile, finditer +from re import I, S, compile, finditer, sub from typing import Optional # SPIP syntax to Markdown @@ -18,16 +18,16 @@ spip_to_markdown = ( r"## \1", # Translate SPIP headings to h2 ), ( # strong - compile(r"\{\{ *(.*?) *\}\}", S | I), - r"**\1**", + compile(r"\{\{ *(.*?) *\}\} ?", S | I), + r"**\1** ", ), ( # html strong compile(r" *(.*?) *", S | I), r"**\1**", ), ( # emphasis - compile(r"\{ *(.*?) *\}", S | I), - r"*\1*", + compile(r"\{ *(.*?) *\} ?", S | I), + r"*\1* ", ), ( # html emphasis compile(r" *(.*?) *<\/i>", S | I), @@ -44,10 +44,6 @@ spip_to_markdown = ( compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I), r"[\1](\2)", ), - ( # document anchor - compile(r"<(?:doc|emb)(.*?)(\|.*?)*>", S | I), - r"[document](\1)", - ), ( # wikilink compile(r"\[\? *(.*?) *\]", S | I), r"[\1](https://wikipedia.org/wiki/\1)", @@ -74,7 +70,7 @@ spip_to_markdown = ( ), ( # table-metadata compile(r"(\r?\n)\|\|(.*?)\|(.*?)\|\|", S | I), - r"", + r"", # Remove it ), ( # quote compile( @@ -97,22 +93,15 @@ spip_to_markdown = ( ), "```\n\\1\n\n```", ), - ( # Keep only the first language in multi-language blocks + ( # WARNING Keep only the first language in multi-language blocks compile( r"\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>", S | I, ), r"\1", ), - ( # WARNING remove every html tag - compile(r"<\/?.*?> *", S | I), - r"", - ), ) -## Match SPIP images -spip_image = compile(r"<(?:img|image)(.*?)(\|.*?)*>", S | I) - spip_to_text = ( ( # strong compile(r"\{\{ *(.*?) *\}\}", S | I), @@ -148,16 +137,19 @@ spip_to_text = ( compile(r"<\/?.*?> *", S | I), r"", ), - ( # beginning with angle bracket(s) + ( # Remove beginning with angle bracket(s) compile(r"^>+ +", S | I), r"", ), - ( # beginning with a number followed by a dot + ( # Remove beginning with a number followed by a dot compile(r"^\d+\. +", S | I), r"", ), ) +# HTML tag WARNING can be used to remove them all +html_tag = compile(r"<\/?.*?> *", S | I) + # Broken ISO encoding to proper UTF-8 iso_to_utf = ( ( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1 @@ -300,6 +292,27 @@ def convert_meta(text: Optional[str]) -> str: return text +# Replace images & documents in SPIP text with Markdown links with human-readable names +def convert_documents(text: str, documents: list[tuple[int, str, str]]) -> str: + for id, name, slug in documents: + text = sub( + r"<(?:img|image)" + str(id) + r"(\|.*?)*>", + f"![{name}]({slug})", + text, + ) + text = sub( + r"<(?:doc|emb)" + str(id) + r"(\|.*?)*>", + f"[{name}]({slug})", + text, + ) + text = sub( + r"\[(.*?)\]\((?:doc|emb)" + str(id) + r"(\|.*?)*\)", + f"[\\1]({slug})", + text, + ) + return text + + # Replace unknown chars with empty strings (delete them) def remove_unknown_chars(text: str) -> str: for char in unknown_iso: @@ -307,6 +320,11 @@ def remove_unknown_chars(text: str) -> str: return text +# Replace HTML tags chars with empty strings (delete them) +def remove_tags(text: str) -> str: + return html_tag.sub("", text) + + # Return a list of tuples giving the start and end of unknown substring in text def unknown_chars(text: str) -> list[tuple[int, int]]: positions: list[tuple[int, int]] = [] diff --git a/spip2md/items.py b/spip2md/items.py index 5bc6f04..4a8b6ee 100644 --- a/spip2md/items.py +++ b/spip2md/items.py @@ -5,7 +5,7 @@ from typing import Any, Optional from slugify import slugify from yaml import dump -from converter import convert_body, convert_meta +from converter import convert_body, convert_documents, convert_meta, remove_tags from database import ( SpipArticles, SpipAuteurs, @@ -98,6 +98,12 @@ class Article(Item): # self.referers: int = article.referers # USELESS in static # self.popularity: float = article.popularite # USELESS in static # self.version = article.id_version # USELESS + # Convert images & files links + self.text = convert_documents( + self.text, [(d.id, d.title, d.get_slug()) for d, _ in self.get_documents()] + ) + # Remove remaining HTML after + self.text = remove_tags(self.text) def get_authors(self) -> list[SpipAuteurs]: return ( @@ -139,6 +145,9 @@ class Article(Item): body += "\n\n# MICROBLOGGING\n\n" + self.microblog return body + def get_documents(self): + return Documents(self.id) + class Section(Item): def __init__(self, section: SpipRubriques) -> None: diff --git a/spip2md/main.py b/spip2md/main.py index 59cf88b..6c302ca 100755 --- a/spip2md/main.py +++ b/spip2md/main.py @@ -119,7 +119,7 @@ if __name__ == "__main__": # Only if script is directly executed if len(get_unknown_chars(article.text)) > 0: unknown_chars_articles.append(article) # Loop over article’s related files (images …) - for document, counter in Documents(article.id): + for document, counter in article.get_documents(): if counter.count % 100 == 0: s: str = "s" if counter.remaining() > 1 else "" print(" Exporting", end="")