image & documents links conversion ok

This commit is contained in:
Guilhem Fauré 2023-05-23 13:40:32 +02:00
parent 94b593da93
commit dbfe417870
3 changed files with 49 additions and 22 deletions

View File

@ -1,5 +1,5 @@
# pyright: strict
from re import I, S, compile, finditer
from re import I, S, compile, finditer, sub
from typing import Optional
# SPIP syntax to Markdown
@ -18,7 +18,7 @@ spip_to_markdown = (
r"## \1", # Translate SPIP headings to h2
),
( # strong
compile(r"\{\{ *(.*?) *\}\}", S | I),
compile(r"\{\{ *(.*?) *\}\} ?", S | I),
r"**\1** ",
),
( # html strong
@ -26,7 +26,7 @@ spip_to_markdown = (
r"**\1**",
),
( # emphasis
compile(r"\{ *(.*?) *\}", S | I),
compile(r"\{ *(.*?) *\} ?", S | I),
r"*\1* ",
),
( # html emphasis
@ -44,10 +44,6 @@ spip_to_markdown = (
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
r"[\1](\2)",
),
( # document anchor
compile(r"<(?:doc|emb)(.*?)(\|.*?)*>", S | I),
r"[document](\1)",
),
( # wikilink
compile(r"\[\? *(.*?) *\]", S | I),
r"[\1](https://wikipedia.org/wiki/\1)",
@ -74,7 +70,7 @@ spip_to_markdown = (
),
( # table-metadata
compile(r"(\r?\n)\|\|(.*?)\|(.*?)\|\|", S | I),
r"",
r"", # Remove it
),
( # quote
compile(
@ -97,22 +93,15 @@ spip_to_markdown = (
),
"```\n\\1\n\n```",
),
( # Keep only the first language in multi-language blocks
( # WARNING Keep only the first language in multi-language blocks
compile(
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
S | I,
),
r"\1",
),
( # WARNING remove every html tag
compile(r"<\/?.*?> *", S | I),
r"",
),
)
## Match SPIP images
spip_image = compile(r"<(?:img|image)(.*?)(\|.*?)*>", S | I)
spip_to_text = (
( # strong
compile(r"\{\{ *(.*?) *\}\}", S | I),
@ -148,16 +137,19 @@ spip_to_text = (
compile(r"<\/?.*?> *", S | I),
r"",
),
( # beginning with angle bracket(s)
( # Remove beginning with angle bracket(s)
compile(r"^>+ +", S | I),
r"",
),
( # beginning with a number followed by a dot
( # Remove beginning with a number followed by a dot
compile(r"^\d+\. +", S | I),
r"",
),
)
# HTML tag pattern. WARNING: it can be used to remove them all
html_tag = compile(r"<\/?.*?> *", S | I)
# Broken ISO encoding to proper UTF-8
iso_to_utf = (
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
@ -300,6 +292,27 @@ def convert_meta(text: Optional[str]) -> str:
return text
# Replace images & documents in SPIP text with Markdown links with human-readable names
def convert_documents(text: str, documents: list[tuple[int, str, str]]) -> str:
    """Replace SPIP image & document references in text with Markdown links.

    Each entry of documents is an (id, name, slug) tuple. For every entry:

    - ``<imgID>`` / ``<imageID>`` becomes ``![name](slug)``
    - ``<docID>`` / ``<embID>`` becomes ``[name](slug)``
    - ``[label](docID)`` / ``[label](embID)`` becomes ``[label](slug)``

    The ID may be followed by an optional part starting with a pipe
    (e.g. ``<img12|left>``), which is dropped. Matching is case-sensitive.

    Returns the converted text; text is returned unchanged when nothing matches.
    """
    for doc_id, name, slug in documents:
        image_link = f"![{name}]({slug})"
        document_link = f"[{name}]({slug})"
        # Replacements are passed as callables so that backslashes in a name or
        # slug are inserted literally instead of being parsed as template
        # escapes (which would raise re.error or expand to a captured group).
        # The options tail uses one negated character class rather than nested
        # lazy quantifiers, which backtrack exponentially on unterminated tags.
        text = sub(
            r"<(?:img|image)" + str(doc_id) + r"(?:\|[^>\n]*)?>",
            lambda _, link=image_link: link,
            text,
        )
        text = sub(
            r"<(?:doc|emb)" + str(doc_id) + r"(?:\|[^>\n]*)?>",
            lambda _, link=document_link: link,
            text,
        )
        # Already-converted Markdown links that still point at a SPIP document:
        # keep the label, swap the target for the slug.
        text = sub(
            r"\[(.*?)\]\((?:doc|emb)" + str(doc_id) + r"(?:\|[^)\n]*)?\)",
            lambda found, target=slug: f"[{found.group(1)}]({target})",
            text,
        )
    return text
# Replace unknown chars with empty strings (delete them)
def remove_unknown_chars(text: str) -> str:
for char in unknown_iso:
@ -307,6 +320,11 @@ def remove_unknown_chars(text: str) -> str:
return text
# Replace HTML tags with empty strings (delete them)
def remove_tags(text: str) -> str:
    """Return text with every match of the html_tag pattern deleted."""
    without_tags: str = html_tag.sub(repl="", string=text)
    return without_tags
# Return a list of tuples giving the start and end of unknown substring in text
def unknown_chars(text: str) -> list[tuple[int, int]]:
positions: list[tuple[int, int]] = []

View File

@ -5,7 +5,7 @@ from typing import Any, Optional
from slugify import slugify
from yaml import dump
from converter import convert_body, convert_meta
from converter import convert_body, convert_documents, convert_meta, remove_tags
from database import (
SpipArticles,
SpipAuteurs,
@ -98,6 +98,12 @@ class Article(Item):
# self.referers: int = article.referers # USELESS in static
# self.popularity: float = article.popularite # USELESS in static
# self.version = article.id_version # USELESS
# Convert images & files links
self.text = convert_documents(
self.text, [(d.id, d.title, d.get_slug()) for d, _ in self.get_documents()]
)
# Remove remaining HTML after
self.text = remove_tags(self.text)
def get_authors(self) -> list[SpipAuteurs]:
return (
@ -139,6 +145,9 @@ class Article(Item):
body += "\n\n# MICROBLOGGING\n\n" + self.microblog
return body
def get_documents(self):
    """Return a Documents object built from this article's id."""
    related_documents = Documents(self.id)
    return related_documents
class Section(Item):
def __init__(self, section: SpipRubriques) -> None:

View File

@ -119,7 +119,7 @@ if __name__ == "__main__": # Only if script is directly executed
if len(get_unknown_chars(article.text)) > 0:
unknown_chars_articles.append(article)
# Loop over articles related files (images …)
for document, counter in Documents(article.id):
for document, counter in article.get_documents():
if counter.count % 100 == 0:
s: str = "s" if counter.remaining() > 1 else ""
print(" Exporting", end="")