internal links almost ok + repair û chars + lighter output for unknown chars

This commit is contained in:
Guilhem Fauré 2023-05-24 15:31:52 +02:00
parent 74aeb8474c
commit 0d9ad19b2f
3 changed files with 34 additions and 13 deletions

2
.gitignore vendored
View File

@ -164,7 +164,7 @@ database/
# Outputted Markdown files
markdown/
output/
out*/
# YAML Configuration
spip2md.yml

View File

@ -48,6 +48,10 @@ SPIP_TO_MARKDOWN = (
compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
r"[](\1\2)",
),
( # internal links
compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I),
r"[](\1\2)",
),
( # anchor
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
r"[\1](\2)",
@ -214,6 +218,10 @@ ISO_TO_UTF = (
"°",
r"°",
),
( # Fix UTF-8 û that was interpreted as ISO 8859-1
"û",
r"û",
),
( # Fix UTF-8 nbsp that was interpreted as ISO 8859-1
" ",
r" ",
@ -241,7 +249,6 @@ ISO_TO_UTF = (
UNKNOWN_ISO = (
r"
",
r"∆",
r"û",
)
@ -298,12 +305,12 @@ def unknown_chars(text: str) -> list[tuple[int, int]]:
# Return strings with unknown chars found in text, surrounded by context_length chars
def unknown_chars_context(text: str, context_length: int = 20) -> list[str]:
def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
errors: list[str] = []
context: str = r".{0," + str(context_length) + r"}"
for char in UNKNOWN_ISO:
matches = finditer(
context + r"(?=" + char + r")" + char + r".*?(?=\r?\n|$)",
context + r"(?=" + char + r")" + char + context,
text,
)
for match in matches:

View File

@ -15,15 +15,6 @@ from database import (
SpipRubriques,
)
EXPORTTYPE: str = "md"
ARTICLE_LINK = compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I)
def link_articles(text: str):
for match in ARTICLE_LINK.findall(text):
article = Article.select().where(Article.id_article == match[0])
class Document(SpipDocuments):
class Meta:
@ -43,6 +34,9 @@ class Document(SpipDocuments):
)
EXPORTTYPE: str = "md"
class Article(SpipArticles):
class Meta:
table_name: str = "spip_articles"
@ -73,6 +67,8 @@ class Article(SpipArticles):
)
for d in documents:
self.texte = link_document(self.texte, d.id_document, d.titre, d.slug())
# Internal (articles) links
self.text = link_articles(self.texte)
return documents
def slug(self, date: bool = False) -> str:
@ -154,6 +150,22 @@ def get_articles(section_id: int, limit: int = 10**6) -> ModelSelect:
)
# Markdown link whose target is a SPIP article reference, e.g. [title](art42)
# Group 1 is the (possibly empty) link text, group 2 is the numeric article id
ARTICLE_LINK = compile(r"\[(.*?)]\((?:art|article)([0-9]+)\)", I)


def link_articles(text: str) -> str:
    """Rewrite internal article links in text to point at the exported files.

    Every ``[title](artN)`` / ``[title](articleN)`` link is replaced by
    ``[title](<article.slug()>/<article.filename()>)``. When the link text is
    empty, the ``titre`` of the referenced article is used as the title.

    A link whose article cannot be found is left untouched, so that a single
    dangling reference does not abort the conversion of the whole text.

    Returns the text with all resolvable article links rewritten.
    """

    def replace_link(match) -> str:
        # get_or_none returns None instead of raising when no row matches
        article = Article.get_or_none(Article.id_article == match.group(2))
        if article is None:
            # Dangling link: keep the original markup as-is
            return match.group(0)
        # Prefer the title written in the link, fall back to the article's
        title: str = match.group(1) or article.titre
        return f"[{title}]({article.slug()}/{article.filename()})"

    # Single pass over the text, each match is rewritten by the callback
    return ARTICLE_LINK.sub(replace_link, text)
class Rubrique(SpipRubriques):
class Meta:
table_name: str = "spip_rubriques"
@ -178,6 +190,8 @@ class Rubrique(SpipRubriques):
)
for d in documents:
self.texte = link_document(self.texte, d.id_document, d.titre, d.slug())
# Internal (articles) links
self.text = link_articles(self.texte)
return documents
def slug(self, date: bool = False) -> str: