diff --git a/spip2md/converter.py b/spip2md/converter.py
index 83ed6a8..98af255 100644
--- a/spip2md/converter.py
+++ b/spip2md/converter.py
@@ -1,5 +1,5 @@
# pyright: strict
-from re import I, S, compile, finditer
+from re import I, S, compile, finditer, sub
from typing import Optional
# SPIP syntax to Markdown
@@ -18,16 +18,16 @@ spip_to_markdown = (
r"## \1", # Translate SPIP headings to h2
),
( # strong
- compile(r"\{\{ *(.*?) *\}\}", S | I),
- r"**\1**",
+ compile(r"\{\{ *(.*?) *\}\} ?", S | I),
+ r"**\1** ",
),
( # html strong
compile(r" *(.*?) *", S | I),
r"**\1**",
),
( # emphasis
- compile(r"\{ *(.*?) *\}", S | I),
- r"*\1*",
+ compile(r"\{ *(.*?) *\} ?", S | I),
+ r"*\1* ",
),
( # html emphasis
compile(r" *(.*?) *<\/i>", S | I),
@@ -44,10 +44,6 @@ spip_to_markdown = (
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
r"[\1](\2)",
),
- ( # document anchor
- compile(r"<(?:doc|emb)(.*?)(\|.*?)*>", S | I),
- r"[document](\1)",
- ),
( # wikilink
compile(r"\[\? *(.*?) *\]", S | I),
r"[\1](https://wikipedia.org/wiki/\1)",
@@ -74,7 +70,7 @@ spip_to_markdown = (
),
( # table-metadata
compile(r"(\r?\n)\|\|(.*?)\|(.*?)\|\|", S | I),
- r"",
+ r"", # Remove it
),
( # quote
compile(
@@ -97,22 +93,15 @@ spip_to_markdown = (
),
"```\n\\1\n\n```",
),
- ( # Keep only the first language in multi-language blocks
+ ( # WARNING Keep only the first language in multi-language blocks
compile(
r"\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
S | I,
),
r"\1",
),
- ( # WARNING remove every html tag
- compile(r"<\/?.*?> *", S | I),
- r"",
- ),
)
-## Match SPIP images
-spip_image = compile(r"<(?:img|image)(.*?)(\|.*?)*>", S | I)
-
spip_to_text = (
( # strong
compile(r"\{\{ *(.*?) *\}\}", S | I),
@@ -148,16 +137,19 @@ spip_to_text = (
compile(r"<\/?.*?> *", S | I),
r"",
),
- ( # beginning with angle bracket(s)
+ ( # Remove beginning with angle bracket(s)
compile(r"^>+ +", S | I),
r"",
),
- ( # beginning with a number followed by a dot
+ ( # Remove beginning with a number followed by a dot
compile(r"^\d+\. +", S | I),
r"",
),
)
+# Matches anything enclosed in angle brackets (i.e. every HTML tag, opening or
+# closing), plus the spaces that directly follow it.
+# WARNING: substituting this pattern with "" removes every tag from a text
+html_tag = compile(r"<\/?.*?> *", S | I)
+
# Broken ISO encoding to proper UTF-8
iso_to_utf = (
( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1
@@ -300,6 +292,27 @@ def convert_meta(text: Optional[str]) -> str:
return text
+# Replace images & documents in SPIP text with Markdown links with human-readable names
+# text: SPIP text that may contain <imgN>, <imageN>, <docN> or <embN> tags (with
+#   optional |option suffixes) or Markdown links of the form [label](docN) / [label](embN)
+# documents: (id, name, slug) tuples, name being the link label & slug the link target
+# Returns the text with every reference to a listed document rewritten
+def convert_documents(text: str, documents: list[tuple[int, str, str]]) -> str:
+    for doc_id, name, slug in documents:
+        # Replacements are callables so that re never parses name or slug as a
+        # replacement template: a backslash in a title would otherwise raise
+        # re.error or be silently turned into a group reference
+        text = sub(
+            r"<(?:img|image)" + str(doc_id) + r"(\|.*?)*>",
+            lambda _: f"![{name}]({slug})",
+            text,
+        )
+        text = sub(
+            r"<(?:doc|emb)" + str(doc_id) + r"(\|.*?)*>",
+            lambda _: f"[{name}]({slug})",
+            text,
+        )
+        # Links that already are Markdown keep their label, only the target changes
+        text = sub(
+            r"\[(.*?)\]\((?:doc|emb)" + str(doc_id) + r"(\|.*?)*\)",
+            lambda link: f"[{link.group(1)}]({slug})",
+            text,
+        )
+    return text
+
+
+
+
# Replace unknown chars with empty strings (delete them)
def remove_unknown_chars(text: str) -> str:
for char in unknown_iso:
@@ -307,6 +320,11 @@ def remove_unknown_chars(text: str) -> str:
return text
+# Delete every HTML tag from text (each html_tag match is replaced by an empty string)
+def remove_tags(text: str) -> str:
+    stripped: str = html_tag.sub("", text)
+    return stripped
+
+
# Return a list of tuples giving the start and end of unknown substring in text
def unknown_chars(text: str) -> list[tuple[int, int]]:
positions: list[tuple[int, int]] = []
diff --git a/spip2md/items.py b/spip2md/items.py
index 5bc6f04..4a8b6ee 100644
--- a/spip2md/items.py
+++ b/spip2md/items.py
@@ -5,7 +5,7 @@ from typing import Any, Optional
from slugify import slugify
from yaml import dump
-from converter import convert_body, convert_meta
+from converter import convert_body, convert_documents, convert_meta, remove_tags
from database import (
SpipArticles,
SpipAuteurs,
@@ -98,6 +98,12 @@ class Article(Item):
# self.referers: int = article.referers # USELESS in static
# self.popularity: float = article.popularite # USELESS in static
# self.version = article.id_version # USELESS
+ # Convert images & files links
+ self.text = convert_documents(
+ self.text, [(d.id, d.title, d.get_slug()) for d, _ in self.get_documents()]
+ )
+ # Remove remaining HTML after
+ self.text = remove_tags(self.text)
def get_authors(self) -> list[SpipAuteurs]:
return (
@@ -139,6 +145,9 @@ class Article(Item):
body += "\n\n# MICROBLOGGING\n\n" + self.microblog
return body
+    # Build the Documents object for this article from its id
+    def get_documents(self):
+        article_id = self.id
+        return Documents(article_id)
+
class Section(Item):
def __init__(self, section: SpipRubriques) -> None:
diff --git a/spip2md/main.py b/spip2md/main.py
index 59cf88b..6c302ca 100755
--- a/spip2md/main.py
+++ b/spip2md/main.py
@@ -119,7 +119,7 @@ if __name__ == "__main__": # Only if script is directly executed
if len(get_unknown_chars(article.text)) > 0:
unknown_chars_articles.append(article)
# Loop over article’s related files (images …)
- for document, counter in Documents(article.id):
+ for document, counter in article.get_documents():
if counter.count % 100 == 0:
s: str = "s" if counter.remaining() > 1 else ""
print(" Exporting", end="")