image & documents links conversion ok

This commit is contained in:
Guilhem Fauré 2023-05-23 13:40:32 +02:00
parent 94b593da93
commit dbfe417870
3 changed files with 49 additions and 22 deletions

View File

@ -1,5 +1,5 @@
# pyright: strict # pyright: strict
from re import I, S, compile, finditer from re import I, S, compile, finditer, sub
from typing import Optional from typing import Optional
# SPIP syntax to Markdown # SPIP syntax to Markdown
@ -18,16 +18,16 @@ spip_to_markdown = (
r"## \1", # Translate SPIP headings to h2 r"## \1", # Translate SPIP headings to h2
), ),
( # strong ( # strong
compile(r"\{\{ *(.*?) *\}\}", S | I), compile(r"\{\{ *(.*?) *\}\} ?", S | I),
r"**\1**", r"**\1** ",
), ),
( # html strong ( # html strong
compile(r"<strong> *(.*?) *</strong>", S | I), compile(r"<strong> *(.*?) *</strong>", S | I),
r"**\1**", r"**\1**",
), ),
( # emphasis ( # emphasis
compile(r"\{ *(.*?) *\}", S | I), compile(r"\{ *(.*?) *\} ?", S | I),
r"*\1*", r"*\1* ",
), ),
( # html emphasis ( # html emphasis
compile(r"<i> *(.*?) *<\/i>", S | I), compile(r"<i> *(.*?) *<\/i>", S | I),
@ -44,10 +44,6 @@ spip_to_markdown = (
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I), compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
r"[\1](\2)", r"[\1](\2)",
), ),
( # document anchor
compile(r"<(?:doc|emb)(.*?)(\|.*?)*>", S | I),
r"[document](\1)",
),
( # wikilink ( # wikilink
compile(r"\[\? *(.*?) *\]", S | I), compile(r"\[\? *(.*?) *\]", S | I),
r"[\1](https://wikipedia.org/wiki/\1)", r"[\1](https://wikipedia.org/wiki/\1)",
@ -74,7 +70,7 @@ spip_to_markdown = (
), ),
( # table-metadata ( # table-metadata
compile(r"(\r?\n)\|\|(.*?)\|(.*?)\|\|", S | I), compile(r"(\r?\n)\|\|(.*?)\|(.*?)\|\|", S | I),
r"", r"", # Remove it
), ),
( # quote ( # quote
compile( compile(
@ -97,22 +93,15 @@ spip_to_markdown = (
), ),
"```\n\\1\n\n```", "```\n\\1\n\n```",
), ),
( # Keep only the first language in multi-language blocks ( # WARNING Keep only the first language in multi-language blocks
compile( compile(
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>", r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
S | I, S | I,
), ),
r"\1", r"\1",
), ),
( # WARNING remove every html tag
compile(r"<\/?.*?> *", S | I),
r"",
),
) )
## Match SPIP images
spip_image = compile(r"<(?:img|image)(.*?)(\|.*?)*>", S | I)
spip_to_text = ( spip_to_text = (
( # strong ( # strong
compile(r"\{\{ *(.*?) *\}\}", S | I), compile(r"\{\{ *(.*?) *\}\}", S | I),
@ -148,16 +137,19 @@ spip_to_text = (
compile(r"<\/?.*?> *", S | I), compile(r"<\/?.*?> *", S | I),
r"", r"",
), ),
( # beginning with angle bracket(s) ( # Remove beginning with angle bracket(s)
compile(r"^>+ +", S | I), compile(r"^>+ +", S | I),
r"", r"",
), ),
( # beginning with a number followed by a dot ( # Remove beginning with a number followed by a dot
compile(r"^\d+\. +", S | I), compile(r"^\d+\. +", S | I),
r"", r"",
), ),
) )
# Matches any HTML tag and the spaces right after it. WARNING: substituting with this pattern removes every tag
html_tag = compile(r"<\/?.*?> *", S | I)
# Broken ISO encoding to proper UTF-8 # Broken ISO encoding to proper UTF-8
iso_to_utf = ( iso_to_utf = (
( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1 ( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1
@ -300,6 +292,27 @@ def convert_meta(text: Optional[str]) -> str:
return text return text
# Replace images & documents in SPIP text with Markdown links with human-readable names
def convert_documents(text: str, documents: list[tuple[int, str, str]]) -> str:
    """Rewrite SPIP image/document references in ``text`` as Markdown links.

    For each ``(id, name, slug)`` tuple in ``documents``:

    * ``<imgID>`` / ``<imageID>`` (optionally followed by ``|options``)
      becomes ``![name](slug)``
    * ``<docID>`` / ``<embID>`` (optionally followed by ``|options``)
      becomes ``[name](slug)``
    * ``[label](docID)`` / ``[label](embID)`` becomes ``[label](slug)``

    References whose ID is not listed in ``documents`` are left untouched.
    Returns the converted text.
    """
    for doc_id, name, slug in documents:
        # Names and slugs are data, not replacement templates: double their
        # backslashes so that sub() inserts them literally instead of reading
        # sequences such as "\1" or "\d" as template escapes
        image_link = f"![{name}]({slug})".replace("\\", "\\\\")
        document_link = f"[{name}]({slug})".replace("\\", "\\\\")
        target = f"{slug}".replace("\\", "\\\\")
        # The optional "|options" part is matched with a negated character class
        # (same matches as the lazy "(\|.*?)*" form) so that an unterminated tag
        # full of "|" cannot trigger exponential backtracking
        text = sub(
            rf"<(?:img|image){doc_id}(?:\|[^>\n]*)?>",
            image_link,
            text,
        )
        text = sub(
            rf"<(?:doc|emb){doc_id}(?:\|[^>\n]*)?>",
            document_link,
            text,
        )
        # Already-converted Markdown links that still point to a SPIP document ID:
        # keep the label (group 1) and only swap the target
        text = sub(
            rf"\[(.*?)\]\((?:doc|emb){doc_id}(?:\|[^)\n]*)?\)",
            "[\\1](" + target + ")",
            text,
        )
    return text
# Replace unknown chars with empty strings (delete them) # Replace unknown chars with empty strings (delete them)
def remove_unknown_chars(text: str) -> str: def remove_unknown_chars(text: str) -> str:
for char in unknown_iso: for char in unknown_iso:
@ -307,6 +320,11 @@ def remove_unknown_chars(text: str) -> str:
return text return text
# Delete every HTML tag from the text (each match of html_tag becomes an empty string)
def remove_tags(text: str) -> str:
    stripped: str = html_tag.sub("", text)
    return stripped
# Return a list of tuples giving the start and end of unknown substring in text # Return a list of tuples giving the start and end of unknown substring in text
def unknown_chars(text: str) -> list[tuple[int, int]]: def unknown_chars(text: str) -> list[tuple[int, int]]:
positions: list[tuple[int, int]] = [] positions: list[tuple[int, int]] = []

View File

@ -5,7 +5,7 @@ from typing import Any, Optional
from slugify import slugify from slugify import slugify
from yaml import dump from yaml import dump
from converter import convert_body, convert_meta from converter import convert_body, convert_documents, convert_meta, remove_tags
from database import ( from database import (
SpipArticles, SpipArticles,
SpipAuteurs, SpipAuteurs,
@ -98,6 +98,12 @@ class Article(Item):
# self.referers: int = article.referers # USELESS in static # self.referers: int = article.referers # USELESS in static
# self.popularity: float = article.popularite # USELESS in static # self.popularity: float = article.popularite # USELESS in static
# self.version = article.id_version # USELESS # self.version = article.id_version # USELESS
# Convert images & files links
self.text = convert_documents(
self.text, [(d.id, d.title, d.get_slug()) for d, _ in self.get_documents()]
)
# Remove remaining HTML after
self.text = remove_tags(self.text)
def get_authors(self) -> list[SpipAuteurs]: def get_authors(self) -> list[SpipAuteurs]:
return ( return (
@ -139,6 +145,9 @@ class Article(Item):
body += "\n\n# MICROBLOGGING\n\n" + self.microblog body += "\n\n# MICROBLOGGING\n\n" + self.microblog
return body return body
def get_documents(self):
    # Build the Documents collection from this article's id; the visible
    # callers iterate it as (document, counter) pairs
    related = Documents(self.id)
    return related
class Section(Item): class Section(Item):
def __init__(self, section: SpipRubriques) -> None: def __init__(self, section: SpipRubriques) -> None:

View File

@ -119,7 +119,7 @@ if __name__ == "__main__": # Only if script is directly executed
if len(get_unknown_chars(article.text)) > 0: if len(get_unknown_chars(article.text)) > 0:
unknown_chars_articles.append(article) unknown_chars_articles.append(article)
# Loop over articles related files (images …) # Loop over articles related files (images …)
for document, counter in Documents(article.id): for document, counter in article.get_documents():
if counter.count % 100 == 0: if counter.count % 100 == 0:
s: str = "s" if counter.remaining() > 1 else "" s: str = "s" if counter.remaining() > 1 else ""
print(" Exporting", end="") print(" Exporting", end="")