image & documents links conversion ok
This commit is contained in:
parent
94b593da93
commit
dbfe417870
@ -1,5 +1,5 @@
|
||||
# pyright: strict
|
||||
from re import I, S, compile, finditer
|
||||
from re import I, S, compile, finditer, sub
|
||||
from typing import Optional
|
||||
|
||||
# SPIP syntax to Markdown
|
||||
@ -18,7 +18,7 @@ spip_to_markdown = (
|
||||
r"## \1", # Translate SPIP headings to h2
|
||||
),
|
||||
( # strong
|
||||
compile(r"\{\{ *(.*?) *\}\}", S | I),
|
||||
compile(r"\{\{ *(.*?) *\}\} ?", S | I),
|
||||
r"**\1** ",
|
||||
),
|
||||
( # html strong
|
||||
@ -26,7 +26,7 @@ spip_to_markdown = (
|
||||
r"**\1**",
|
||||
),
|
||||
( # emphasis
|
||||
compile(r"\{ *(.*?) *\}", S | I),
|
||||
compile(r"\{ *(.*?) *\} ?", S | I),
|
||||
r"*\1* ",
|
||||
),
|
||||
( # html emphasis
|
||||
@ -44,10 +44,6 @@ spip_to_markdown = (
|
||||
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
|
||||
r"[\1](\2)",
|
||||
),
|
||||
( # document anchor
|
||||
compile(r"<(?:doc|emb)(.*?)(\|.*?)*>", S | I),
|
||||
r"[document](\1)",
|
||||
),
|
||||
( # wikilink
|
||||
compile(r"\[\? *(.*?) *\]", S | I),
|
||||
r"[\1](https://wikipedia.org/wiki/\1)",
|
||||
@ -74,7 +70,7 @@ spip_to_markdown = (
|
||||
),
|
||||
( # table-metadata
|
||||
compile(r"(\r?\n)\|\|(.*?)\|(.*?)\|\|", S | I),
|
||||
r"",
|
||||
r"", # Remove it
|
||||
),
|
||||
( # quote
|
||||
compile(
|
||||
@ -97,22 +93,15 @@ spip_to_markdown = (
|
||||
),
|
||||
"```\n\\1\n\n```",
|
||||
),
|
||||
( # Keep only the first language in multi-language blocks
|
||||
( # WARNING Keep only the first language in multi-language blocks
|
||||
compile(
|
||||
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
|
||||
S | I,
|
||||
),
|
||||
r"\1",
|
||||
),
|
||||
( # WARNING remove every html tag
|
||||
compile(r"<\/?.*?> *", S | I),
|
||||
r"",
|
||||
),
|
||||
)
|
||||
|
||||
## Match SPIP images
|
||||
spip_image = compile(r"<(?:img|image)(.*?)(\|.*?)*>", S | I)
|
||||
|
||||
spip_to_text = (
|
||||
( # strong
|
||||
compile(r"\{\{ *(.*?) *\}\}", S | I),
|
||||
@ -148,16 +137,19 @@ spip_to_text = (
|
||||
compile(r"<\/?.*?> *", S | I),
|
||||
r"",
|
||||
),
|
||||
( # beginning with angle bracket(s)
|
||||
( # Remove beginning with angle bracket(s)
|
||||
compile(r"^>+ +", S | I),
|
||||
r"",
|
||||
),
|
||||
( # beginning with a number followed by a dot
|
||||
( # Remove beginning with a number followed by a dot
|
||||
compile(r"^\d+\. +", S | I),
|
||||
r"",
|
||||
),
|
||||
)
|
||||
|
||||
# Matches any HTML tag and the spaces following it. WARNING: substituting with it removes every tag
|
||||
html_tag = compile(r"<\/?.*?> *", S | I)
|
||||
|
||||
# Broken ISO encoding to proper UTF-8
|
||||
iso_to_utf = (
|
||||
( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1
|
||||
@ -300,6 +292,27 @@ def convert_meta(text: Optional[str]) -> str:
|
||||
return text
|
||||
|
||||
|
||||
# Replace images & documents in SPIP text with Markdown links with human-readable names
|
||||
def convert_documents(text: str, documents: list[tuple[int, str, str]]) -> str:
    """Replace SPIP image & document references in text with Markdown links.

    For every (id, name, slug) entry of documents, three forms are rewritten:

    * ``<imgID>`` / ``<imageID|options>``  ->  ``![name](slug)``
    * ``<docID>`` / ``<embID|options>``    ->  ``[name](slug)``
    * ``[label](docID)`` / ``[label](embID|options)``  ->  ``[label](slug)``

    Args:
        text: Text containing SPIP image/document references.
        documents: Tuples of (document id, human-readable name, link target).

    Returns:
        The text with every reference to a listed document converted.
        References to ids that are not listed are left untouched.
    """
    for doc_id, name, slug in documents:
        ref = str(doc_id)
        # A backslash is the only special character in a re.sub replacement
        # template: double it so names/slugs are always inserted literally
        # instead of raising "bad escape" or being read as a group reference.
        safe_name = name.replace("\\", "\\\\")
        safe_slug = slug.replace("\\", "\\\\")
        # The optional "|options" part is matched with a negated character
        # class rather than nested lazy quantifiers: same matches, but no
        # exponential backtracking on an unterminated tag.
        text = sub(
            r"<(?:img|image)" + ref + r"(?:\|[^>\n]*)?>",
            f"![{safe_name}]({safe_slug})",
            text,
        )
        text = sub(
            r"<(?:doc|emb)" + ref + r"(?:\|[^>\n]*)?>",
            f"[{safe_name}]({safe_slug})",
            text,
        )
        # Already-converted Markdown link whose target is still a SPIP
        # document reference: keep the label (\1), swap only the target.
        text = sub(
            r"\[(.*?)\]\((?:doc|emb)" + ref + r"(?:\|[^)\n]*)?\)",
            f"[\\1]({safe_slug})",
            text,
        )
    return text
|
||||
|
||||
|
||||
# Replace unknown chars with empty strings (delete them)
|
||||
def remove_unknown_chars(text: str) -> str:
|
||||
for char in unknown_iso:
|
||||
@ -307,6 +320,11 @@ def remove_unknown_chars(text: str) -> str:
|
||||
return text
|
||||
|
||||
|
||||
# Remove HTML tags (replace each of them with an empty string)
|
||||
def remove_tags(text: str) -> str:
    """Return text with every match of the html_tag pattern deleted."""
    stripped: str = html_tag.sub("", text)
    return stripped
|
||||
|
||||
|
||||
# Return a list of tuples giving the start and end of unknown substring in text
|
||||
def unknown_chars(text: str) -> list[tuple[int, int]]:
|
||||
positions: list[tuple[int, int]] = []
|
||||
|
@ -5,7 +5,7 @@ from typing import Any, Optional
|
||||
from slugify import slugify
|
||||
from yaml import dump
|
||||
|
||||
from converter import convert_body, convert_meta
|
||||
from converter import convert_body, convert_documents, convert_meta, remove_tags
|
||||
from database import (
|
||||
SpipArticles,
|
||||
SpipAuteurs,
|
||||
@ -98,6 +98,12 @@ class Article(Item):
|
||||
# self.referers: int = article.referers # USELESS in static
|
||||
# self.popularity: float = article.popularite # USELESS in static
|
||||
# self.version = article.id_version # USELESS
|
||||
# Convert images & files links
|
||||
self.text = convert_documents(
|
||||
self.text, [(d.id, d.title, d.get_slug()) for d, _ in self.get_documents()]
|
||||
)
|
||||
# Remove remaining HTML after
|
||||
self.text = remove_tags(self.text)
|
||||
|
||||
def get_authors(self) -> list[SpipAuteurs]:
|
||||
return (
|
||||
@ -139,6 +145,9 @@ class Article(Item):
|
||||
body += "\n\n# MICROBLOGGING\n\n" + self.microblog
|
||||
return body
|
||||
|
||||
def get_documents(self):
    """Build the Documents collection bound to this item's id."""
    item_id = self.id
    return Documents(item_id)
|
||||
|
||||
|
||||
class Section(Item):
|
||||
def __init__(self, section: SpipRubriques) -> None:
|
||||
|
@ -119,7 +119,7 @@ if __name__ == "__main__": # Only if script is directly executed
|
||||
if len(get_unknown_chars(article.text)) > 0:
|
||||
unknown_chars_articles.append(article)
|
||||
# Loop over article’s related files (images …)
|
||||
for document, counter in Documents(article.id):
|
||||
for document, counter in article.get_documents():
|
||||
if counter.count % 100 == 0:
|
||||
s: str = "s" if counter.remaining() > 1 else ""
|
||||
print(" Exporting", end="")
|
||||
|
Loading…
Reference in New Issue
Block a user