image & documents links conversion ok
This commit is contained in:
parent
94b593da93
commit
dbfe417870
@ -1,5 +1,5 @@
|
|||||||
# pyright: strict
|
# pyright: strict
|
||||||
from re import I, S, compile, finditer
|
from re import I, S, compile, finditer, sub
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
# SPIP syntax to Markdown
|
# SPIP syntax to Markdown
|
||||||
@ -18,7 +18,7 @@ spip_to_markdown = (
|
|||||||
r"## \1", # Translate SPIP headings to h2
|
r"## \1", # Translate SPIP headings to h2
|
||||||
),
|
),
|
||||||
( # strong
|
( # strong
|
||||||
compile(r"\{\{ *(.*?) *\}\}", S | I),
|
compile(r"\{\{ *(.*?) *\}\} ?", S | I),
|
||||||
r"**\1** ",
|
r"**\1** ",
|
||||||
),
|
),
|
||||||
( # html strong
|
( # html strong
|
||||||
@ -26,7 +26,7 @@ spip_to_markdown = (
|
|||||||
r"**\1**",
|
r"**\1**",
|
||||||
),
|
),
|
||||||
( # emphasis
|
( # emphasis
|
||||||
compile(r"\{ *(.*?) *\}", S | I),
|
compile(r"\{ *(.*?) *\} ?", S | I),
|
||||||
r"*\1* ",
|
r"*\1* ",
|
||||||
),
|
),
|
||||||
( # html emphasis
|
( # html emphasis
|
||||||
@ -44,10 +44,6 @@ spip_to_markdown = (
|
|||||||
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
|
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
|
||||||
r"[\1](\2)",
|
r"[\1](\2)",
|
||||||
),
|
),
|
||||||
( # document anchor
|
|
||||||
compile(r"<(?:doc|emb)(.*?)(\|.*?)*>", S | I),
|
|
||||||
r"[document](\1)",
|
|
||||||
),
|
|
||||||
( # wikilink
|
( # wikilink
|
||||||
compile(r"\[\? *(.*?) *\]", S | I),
|
compile(r"\[\? *(.*?) *\]", S | I),
|
||||||
r"[\1](https://wikipedia.org/wiki/\1)",
|
r"[\1](https://wikipedia.org/wiki/\1)",
|
||||||
@ -74,7 +70,7 @@ spip_to_markdown = (
|
|||||||
),
|
),
|
||||||
( # table-metadata
|
( # table-metadata
|
||||||
compile(r"(\r?\n)\|\|(.*?)\|(.*?)\|\|", S | I),
|
compile(r"(\r?\n)\|\|(.*?)\|(.*?)\|\|", S | I),
|
||||||
r"",
|
r"", # Remove it
|
||||||
),
|
),
|
||||||
( # quote
|
( # quote
|
||||||
compile(
|
compile(
|
||||||
@ -97,22 +93,15 @@ spip_to_markdown = (
|
|||||||
),
|
),
|
||||||
"```\n\\1\n\n```",
|
"```\n\\1\n\n```",
|
||||||
),
|
),
|
||||||
( # Keep only the first language in multi-language blocks
|
( # WARNING Keep only the first language in multi-language blocks
|
||||||
compile(
|
compile(
|
||||||
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
|
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
|
||||||
S | I,
|
S | I,
|
||||||
),
|
),
|
||||||
r"\1",
|
r"\1",
|
||||||
),
|
),
|
||||||
( # WARNING remove every html tag
|
|
||||||
compile(r"<\/?.*?> *", S | I),
|
|
||||||
r"",
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
## Match SPIP images
|
|
||||||
spip_image = compile(r"<(?:img|image)(.*?)(\|.*?)*>", S | I)
|
|
||||||
|
|
||||||
spip_to_text = (
|
spip_to_text = (
|
||||||
( # strong
|
( # strong
|
||||||
compile(r"\{\{ *(.*?) *\}\}", S | I),
|
compile(r"\{\{ *(.*?) *\}\}", S | I),
|
||||||
@ -148,16 +137,19 @@ spip_to_text = (
|
|||||||
compile(r"<\/?.*?> *", S | I),
|
compile(r"<\/?.*?> *", S | I),
|
||||||
r"",
|
r"",
|
||||||
),
|
),
|
||||||
( # beginning with angle bracket(s)
|
( # Remove beginning with angle bracket(s)
|
||||||
compile(r"^>+ +", S | I),
|
compile(r"^>+ +", S | I),
|
||||||
r"",
|
r"",
|
||||||
),
|
),
|
||||||
( # beginning with a number followed by a dot
|
( # Remove beginning with a number followed by a dot
|
||||||
compile(r"^\d+\. +", S | I),
|
compile(r"^\d+\. +", S | I),
|
||||||
r"",
|
r"",
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# HTML tag pattern. WARNING: substituting it with an empty string removes every tag
|
||||||
|
html_tag = compile(r"<\/?.*?> *", S | I)
|
||||||
|
|
||||||
# Broken ISO encoding to proper UTF-8
|
# Broken ISO encoding to proper UTF-8
|
||||||
iso_to_utf = (
|
iso_to_utf = (
|
||||||
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
|
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
|
||||||
@ -300,6 +292,27 @@ def convert_meta(text: Optional[str]) -> str:
|
|||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
# Replace images & documents in SPIP text with Markdown links with human-readable names
def convert_documents(text: str, documents: list[tuple[int, str, str]]) -> str:
    """Rewrite SPIP image/document references in *text* as Markdown links.

    Each entry of *documents* is an ``(id, name, slug)`` tuple. For every entry:

    * ``<imgID>`` / ``<imageID>`` (optionally followed by ``|options``)
      becomes ``![name](slug)``;
    * ``<docID>`` / ``<embID>`` (optionally followed by ``|options``)
      becomes ``[name](slug)``;
    * an already converted Markdown link ``[label](docID)`` / ``[label](embID)``
      keeps its label and gets ``slug`` as target.

    Returns the converted text; references to ids not listed are left untouched.
    """
    # Optional "|option|option…" suffix, up to the closing delimiter on the same
    # line. A single negated character class matches the same spans as the nested
    # lazy form "(\|.*?)*" but cannot backtrack exponentially on unterminated tags.
    tag_options = r"(?:\|[^>\n]*)?>"
    link_options = r"(?:\|[^)\n]*)?\)"

    for doc_id, name, slug in documents:
        ref = str(doc_id)
        # Replacements are callables so that backslashes in name/slug are inserted
        # literally instead of being parsed as regex template escapes.
        image_md = f"![{name}]({slug})"
        text = sub(
            r"<(?:img|image)" + ref + tag_options,
            lambda _match, out=image_md: out,
            text,
        )
        document_md = f"[{name}]({slug})"
        text = sub(
            r"<(?:doc|emb)" + ref + tag_options,
            lambda _match, out=document_md: out,
            text,
        )
        text = sub(
            r"\[(.*?)\]\((?:doc|emb)" + ref + link_options,
            lambda match, target=slug: f"[{match.group(1)}]({target})",
            text,
        )
    return text
|
||||||
|
|
||||||
|
|
||||||
# Replace unknown chars with empty strings (delete them)
|
# Replace unknown chars with empty strings (delete them)
|
||||||
def remove_unknown_chars(text: str) -> str:
|
def remove_unknown_chars(text: str) -> str:
|
||||||
for char in unknown_iso:
|
for char in unknown_iso:
|
||||||
@ -307,6 +320,11 @@ def remove_unknown_chars(text: str) -> str:
|
|||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
# Delete HTML tags: every match of the html_tag pattern is replaced with an empty string
def remove_tags(text: str) -> str:
    stripped: str = html_tag.sub("", text)
    return stripped
|
||||||
|
|
||||||
|
|
||||||
# Return a list of tuples giving the start and end of unknown substring in text
|
# Return a list of tuples giving the start and end of unknown substring in text
|
||||||
def unknown_chars(text: str) -> list[tuple[int, int]]:
|
def unknown_chars(text: str) -> list[tuple[int, int]]:
|
||||||
positions: list[tuple[int, int]] = []
|
positions: list[tuple[int, int]] = []
|
||||||
|
@ -5,7 +5,7 @@ from typing import Any, Optional
|
|||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
from yaml import dump
|
from yaml import dump
|
||||||
|
|
||||||
from converter import convert_body, convert_meta
|
from converter import convert_body, convert_documents, convert_meta, remove_tags
|
||||||
from database import (
|
from database import (
|
||||||
SpipArticles,
|
SpipArticles,
|
||||||
SpipAuteurs,
|
SpipAuteurs,
|
||||||
@ -98,6 +98,12 @@ class Article(Item):
|
|||||||
# self.referers: int = article.referers # USELESS in static
|
# self.referers: int = article.referers # USELESS in static
|
||||||
# self.popularity: float = article.popularite # USELESS in static
|
# self.popularity: float = article.popularite # USELESS in static
|
||||||
# self.version = article.id_version # USELESS
|
# self.version = article.id_version # USELESS
|
||||||
|
# Convert images & files links
|
||||||
|
self.text = convert_documents(
|
||||||
|
self.text, [(d.id, d.title, d.get_slug()) for d, _ in self.get_documents()]
|
||||||
|
)
|
||||||
|
# Remove remaining HTML after
|
||||||
|
self.text = remove_tags(self.text)
|
||||||
|
|
||||||
def get_authors(self) -> list[SpipAuteurs]:
|
def get_authors(self) -> list[SpipAuteurs]:
|
||||||
return (
|
return (
|
||||||
@ -139,6 +145,9 @@ class Article(Item):
|
|||||||
body += "\n\n# MICROBLOGGING\n\n" + self.microblog
|
body += "\n\n# MICROBLOGGING\n\n" + self.microblog
|
||||||
return body
|
return body
|
||||||
|
|
||||||
|
def get_documents(self):
    # Build the Documents collection for this article from its id;
    # callers iterate it as (document, counter) pairs.
    related_documents = Documents(self.id)
    return related_documents
|
||||||
|
|
||||||
|
|
||||||
class Section(Item):
|
class Section(Item):
|
||||||
def __init__(self, section: SpipRubriques) -> None:
|
def __init__(self, section: SpipRubriques) -> None:
|
||||||
|
@ -119,7 +119,7 @@ if __name__ == "__main__": # Only if script is directly executed
|
|||||||
if len(get_unknown_chars(article.text)) > 0:
|
if len(get_unknown_chars(article.text)) > 0:
|
||||||
unknown_chars_articles.append(article)
|
unknown_chars_articles.append(article)
|
||||||
# Loop over article’s related files (images …)
|
# Loop over article’s related files (images …)
|
||||||
for document, counter in Documents(article.id):
|
for document, counter in article.get_documents():
|
||||||
if counter.count % 100 == 0:
|
if counter.count % 100 == 0:
|
||||||
s: str = "s" if counter.remaining() > 1 else ""
|
s: str = "s" if counter.remaining() > 1 else ""
|
||||||
print(" Exporting", end="")
|
print(" Exporting", end="")
|
||||||
|
Loading…
Reference in New Issue
Block a user