internal links almost ok + repair û chars + lighter output for unknown chars
This commit is contained in:
parent
74aeb8474c
commit
0d9ad19b2f
2
.gitignore
vendored
2
.gitignore
vendored
@ -164,7 +164,7 @@ database/
|
||||
|
||||
# Outputted Markdown files
|
||||
markdown/
|
||||
output/
|
||||
out*/
|
||||
|
||||
# YAML Configuration
|
||||
spip2md.yml
|
||||
|
@ -48,6 +48,10 @@ SPIP_TO_MARKDOWN = (
|
||||
compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
|
||||
r"[](\1\2)",
|
||||
),
|
||||
( # internal links
|
||||
compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I),
|
||||
r"[](\1\2)",
|
||||
),
|
||||
( # anchor
|
||||
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
|
||||
r"[\1](\2)",
|
||||
@ -214,6 +218,10 @@ ISO_TO_UTF = (
|
||||
"°",
|
||||
r"°",
|
||||
),
|
||||
( # Fix UTF-8 û that was interpreted as ISO 8859-1
|
||||
"û",
|
||||
r"û",
|
||||
),
|
||||
( # Fix UTF-8 nbsp that was interpreted as ISO 8859-1
|
||||
"Â ",
|
||||
r" ",
|
||||
@ -241,7 +249,6 @@ ISO_TO_UTF = (
|
||||
UNKNOWN_ISO = (
|
||||
r"
",
|
||||
r"∆",
|
||||
r"û",
|
||||
)
|
||||
|
||||
|
||||
@ -298,12 +305,12 @@ def unknown_chars(text: str) -> list[tuple[int, int]]:
|
||||
|
||||
|
||||
# Return strings with unknown chars found in text, surrounded by context_length chars
|
||||
def unknown_chars_context(text: str, context_length: int = 20) -> list[str]:
|
||||
def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
|
||||
errors: list[str] = []
|
||||
context: str = r".{0," + str(context_length) + r"}"
|
||||
for char in UNKNOWN_ISO:
|
||||
matches = finditer(
|
||||
context + r"(?=" + char + r")" + char + r".*?(?=\r?\n|$)",
|
||||
context + r"(?=" + char + r")" + char + context,
|
||||
text,
|
||||
)
|
||||
for match in matches:
|
||||
|
@ -15,15 +15,6 @@ from database import (
|
||||
SpipRubriques,
|
||||
)
|
||||
|
||||
EXPORTTYPE: str = "md"
|
||||
|
||||
ARTICLE_LINK = compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I)
|
||||
|
||||
|
||||
def link_articles(text: str):
|
||||
for match in ARTICLE_LINK.findall(text):
|
||||
article = Article.select().where(Article.id_article == match[0])
|
||||
|
||||
|
||||
class Document(SpipDocuments):
|
||||
class Meta:
|
||||
@ -43,6 +34,9 @@ class Document(SpipDocuments):
|
||||
)
|
||||
|
||||
|
||||
EXPORTTYPE: str = "md"
|
||||
|
||||
|
||||
class Article(SpipArticles):
|
||||
class Meta:
|
||||
table_name: str = "spip_articles"
|
||||
@ -73,6 +67,8 @@ class Article(SpipArticles):
|
||||
)
|
||||
for d in documents:
|
||||
self.texte = link_document(self.texte, d.id_document, d.titre, d.slug())
|
||||
# Internal (articles) links
|
||||
self.text = link_articles(self.texte)
|
||||
return documents
|
||||
|
||||
def slug(self, date: bool = False) -> str:
|
||||
@ -154,6 +150,22 @@ def get_articles(section_id: int, limit: int = 10**6) -> ModelSelect:
|
||||
)
|
||||
|
||||
|
||||
# Matches Markdown links whose target is a SPIP article reference, such as
# "[some title](art12)" or "[](article7)" (case-insensitive).
# Group 1 is the (possibly empty) link text, group 2 the numeric article id.
ARTICLE_LINK = compile(r"\[(.*?)]\((?:art|article)([0-9]+)\)", I)


def link_articles(text: str) -> str:
    """Rewrite internal article links found in *text*.

    Every link matched by ARTICLE_LINK is replaced by a Markdown link whose
    target is ``<article.slug()>/<article.filename()>``. When the link text
    is empty, the title of the referenced article (``article.titre``) is
    used instead.

    Links that reference an article which cannot be found are left
    untouched, so one dangling reference does not abort the conversion.

    Returns the text with all resolvable internal links rewritten.
    """

    def _resolve(match) -> str:
        # Callback for ARTICLE_LINK.sub: build the replacement for one link.
        try:
            article = Article.get(Article.id_article == match.group(2))
        except Article.DoesNotExist:
            # NOTE(review): assumes Article is a peewee model, whose get()
            # raises <Model>.DoesNotExist when no row matches.
            return match.group(0)
        # Keep the author's link text when there is one, else use the title
        title: str = match.group(1) or article.titre
        return f"[{title}]({article.slug()}/{article.filename()})"

    # A single substitution pass: each link is resolved where it stands
    return ARTICLE_LINK.sub(_resolve, text)
|
||||
|
||||
|
||||
class Rubrique(SpipRubriques):
|
||||
class Meta:
|
||||
table_name: str = "spip_rubriques"
|
||||
@ -178,6 +190,8 @@ class Rubrique(SpipRubriques):
|
||||
)
|
||||
for d in documents:
|
||||
self.texte = link_document(self.texte, d.id_document, d.titre, d.slug())
|
||||
# Internal (articles) links
|
||||
self.text = link_articles(self.texte)
|
||||
return documents
|
||||
|
||||
def slug(self, date: bool = False) -> str:
|
||||
|
Loading…
Reference in New Issue
Block a user