internal links almost ok + repair û chars + lighter output for unknown chars
This commit is contained in:
parent
74aeb8474c
commit
0d9ad19b2f
2
.gitignore
vendored
2
.gitignore
vendored
@ -164,7 +164,7 @@ database/
|
|||||||
|
|
||||||
# Outputted Markdown files
|
# Outputted Markdown files
|
||||||
markdown/
|
markdown/
|
||||||
output/
|
out*/
|
||||||
|
|
||||||
# YAML Configuration
|
# YAML Configuration
|
||||||
spip2md.yml
|
spip2md.yml
|
||||||
|
@ -48,6 +48,10 @@ SPIP_TO_MARKDOWN = (
|
|||||||
compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
|
compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
|
||||||
r"[](\1\2)",
|
r"[](\1\2)",
|
||||||
),
|
),
|
||||||
|
( # internal links
|
||||||
|
compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I),
|
||||||
|
r"[](\1\2)",
|
||||||
|
),
|
||||||
( # anchor
|
( # anchor
|
||||||
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
|
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
|
||||||
r"[\1](\2)",
|
r"[\1](\2)",
|
||||||
@ -214,6 +218,10 @@ ISO_TO_UTF = (
|
|||||||
"°",
|
"°",
|
||||||
r"°",
|
r"°",
|
||||||
),
|
),
|
||||||
|
( # Fix UTF-8 û that was interpreted as ISO 8859-1
|
||||||
|
"û",
|
||||||
|
r"û",
|
||||||
|
),
|
||||||
( # Fix UTF-8 nbsp that was interpreted as ISO 8859-1
|
( # Fix UTF-8 nbsp that was interpreted as ISO 8859-1
|
||||||
"Â ",
|
"Â ",
|
||||||
r" ",
|
r" ",
|
||||||
@ -241,7 +249,6 @@ ISO_TO_UTF = (
|
|||||||
UNKNOWN_ISO = (
|
UNKNOWN_ISO = (
|
||||||
r"
",
|
r"
",
|
||||||
r"∆",
|
r"∆",
|
||||||
r"û",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -298,12 +305,12 @@ def unknown_chars(text: str) -> list[tuple[int, int]]:
|
|||||||
|
|
||||||
|
|
||||||
# Return strings with unknown chars found in text, surrounded by context_length chars
|
# Return strings with unknown chars found in text, surrounded by context_length chars
|
||||||
def unknown_chars_context(text: str, context_length: int = 20) -> list[str]:
|
def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
context: str = r".{0," + str(context_length) + r"}"
|
context: str = r".{0," + str(context_length) + r"}"
|
||||||
for char in UNKNOWN_ISO:
|
for char in UNKNOWN_ISO:
|
||||||
matches = finditer(
|
matches = finditer(
|
||||||
context + r"(?=" + char + r")" + char + r".*?(?=\r?\n|$)",
|
context + r"(?=" + char + r")" + char + context,
|
||||||
text,
|
text,
|
||||||
)
|
)
|
||||||
for match in matches:
|
for match in matches:
|
||||||
|
@ -15,15 +15,6 @@ from database import (
|
|||||||
SpipRubriques,
|
SpipRubriques,
|
||||||
)
|
)
|
||||||
|
|
||||||
EXPORTTYPE: str = "md"
|
|
||||||
|
|
||||||
ARTICLE_LINK = compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I)
|
|
||||||
|
|
||||||
|
|
||||||
def link_articles(text: str):
|
|
||||||
for match in ARTICLE_LINK.findall(text):
|
|
||||||
article = Article.select().where(Article.id_article == match[0])
|
|
||||||
|
|
||||||
|
|
||||||
class Document(SpipDocuments):
|
class Document(SpipDocuments):
|
||||||
class Meta:
|
class Meta:
|
||||||
@ -43,6 +34,9 @@ class Document(SpipDocuments):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
EXPORTTYPE: str = "md"
|
||||||
|
|
||||||
|
|
||||||
class Article(SpipArticles):
|
class Article(SpipArticles):
|
||||||
class Meta:
|
class Meta:
|
||||||
table_name: str = "spip_articles"
|
table_name: str = "spip_articles"
|
||||||
@ -73,6 +67,8 @@ class Article(SpipArticles):
|
|||||||
)
|
)
|
||||||
for d in documents:
|
for d in documents:
|
||||||
self.texte = link_document(self.texte, d.id_document, d.titre, d.slug())
|
self.texte = link_document(self.texte, d.id_document, d.titre, d.slug())
|
||||||
|
# Internal (articles) links
|
||||||
|
self.text = link_articles(self.texte)
|
||||||
return documents
|
return documents
|
||||||
|
|
||||||
def slug(self, date: bool = False) -> str:
|
def slug(self, date: bool = False) -> str:
|
||||||
@ -154,6 +150,22 @@ def get_articles(section_id: int, limit: int = 10**6) -> ModelSelect:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
ARTICLE_LINK = compile(r"\[(.*?)]\((?:art|article)([0-9]+)\)", I)
|
||||||
|
|
||||||
|
|
||||||
|
def link_articles(text: str) -> str:
    """Rewrite internal article links in ``text`` to point at exported files.

    Every match of ``ARTICLE_LINK`` (a Markdown-style link whose target is
    ``art<N>`` or ``article<N>``) is replaced by
    ``[title](<article.slug()>/<article.filename()>)``. The label already
    present in the link is kept when it is non-empty; otherwise the
    article's ``titre`` is used as the label.

    Links that reference an article which cannot be found are left
    untouched rather than aborting the whole conversion.

    Returns the text with all resolvable article links rewritten.
    """

    def _resolve(match) -> str:
        # Build the replacement for a single matched link
        try:
            article = Article.get(Article.id_article == match.group(2))
        except Article.DoesNotExist:
            # Dangling reference: keep the original link text as-is
            return match.group(0)
        title: str = match.group(1) or article.titre
        return f"[{title}]({article.slug()}/{article.filename()})"

    # A callable replacement is inserted literally, so backslashes in titles
    # or paths are not interpreted as group references
    return ARTICLE_LINK.sub(_resolve, text)
|
||||||
|
|
||||||
|
|
||||||
class Rubrique(SpipRubriques):
|
class Rubrique(SpipRubriques):
|
||||||
class Meta:
|
class Meta:
|
||||||
table_name: str = "spip_rubriques"
|
table_name: str = "spip_rubriques"
|
||||||
@ -178,6 +190,8 @@ class Rubrique(SpipRubriques):
|
|||||||
)
|
)
|
||||||
for d in documents:
|
for d in documents:
|
||||||
self.texte = link_document(self.texte, d.id_document, d.titre, d.slug())
|
self.texte = link_document(self.texte, d.id_document, d.titre, d.slug())
|
||||||
|
# Internal (articles) links
|
||||||
|
self.text = link_articles(self.texte)
|
||||||
return documents
|
return documents
|
||||||
|
|
||||||
def slug(self, date: bool = False) -> str:
|
def slug(self, date: bool = False) -> str:
|
||||||
|
Loading…
Reference in New Issue
Block a user