diff --git a/.gitignore b/.gitignore index e5305f3..ee5a3e5 100644 --- a/.gitignore +++ b/.gitignore @@ -164,7 +164,7 @@ database/ # Outputted Markdown files markdown/ -output/ +out*/ # YAML Configuration spip2md.yml diff --git a/spip2md/converters.py b/spip2md/converters.py index 547d753..5e7bd98 100644 --- a/spip2md/converters.py +++ b/spip2md/converters.py @@ -48,6 +48,10 @@ SPIP_TO_MARKDOWN = ( compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I), r"[](\1\2)", ), + ( # internal links + compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I), + r"[](\1\2)", + ), ( # anchor compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I), r"[\1](\2)", @@ -214,6 +218,10 @@ ISO_TO_UTF = ( "°", r"°", ), + ( # Fix UTF-8 û that was interpreted as ISO 8859-1 + "û", + r"û", + ), ( # Fix UTF-8 nbsp that was interpreted as ISO 8859-1 " ", r" ", @@ -241,7 +249,6 @@ ISO_TO_UTF = ( UNKNOWN_ISO = ( r"
", r"∆", - r"û", ) @@ -298,12 +305,12 @@ def unknown_chars(text: str) -> list[tuple[int, int]]: # Return strings with unknown chards found in text, surrounded by context_length chars -def unknown_chars_context(text: str, context_length: int = 20) -> list[str]: +def unknown_chars_context(text: str, context_length: int = 24) -> list[str]: errors: list[str] = [] context: str = r".{0," + str(context_length) + r"}" for char in UNKNOWN_ISO: matches = finditer( - context + r"(?=" + char + r")" + char + r".*?(?=\r?\n|$)", + context + r"(?=" + char + r")" + char + context, text, ) for match in matches: diff --git a/spip2md/spipobjects.py b/spip2md/spipobjects.py index f7d5338..f66f246 100644 --- a/spip2md/spipobjects.py +++ b/spip2md/spipobjects.py @@ -15,15 +15,6 @@ from database import ( SpipRubriques, ) -EXPORTTYPE: str = "md" - -ARTICLE_LINK = compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I) - - -def link_articles(text: str): - for match in ARTICLE_LINK.findall(text): - article = Article.select().where(Article.id_article == match[0]) - class Document(SpipDocuments): class Meta: @@ -43,6 +34,9 @@ class Document(SpipDocuments): ) +EXPORTTYPE: str = "md" + + class Article(SpipArticles): class Meta: table_name: str = "spip_articles" @@ -73,6 +67,8 @@ class Article(SpipArticles): ) for d in documents: self.texte = link_document(self.texte, d.id_document, d.titre, d.slug()) + # Internal (articles) links + self.text = link_articles(self.texte) return documents def slug(self, date: bool = False) -> str: @@ -154,6 +150,22 @@ def get_articles(section_id: int, limit: int = 10**6) -> ModelSelect: ) +ARTICLE_LINK = compile(r"\[(.*?)]\((?:art|article)([0-9]+)\)", I) + + +def link_articles(text: str): + for match in ARTICLE_LINK.finditer(text): + article = Article.get(Article.id_article == match.group(2)) + if len(match.group(1)) > 0: + title: str = match.group(1) + else: + title: str = article.titre + text = text.replace( + match.group(0), f"[{title}]({article.slug()}/{article.filename()})" + ) + return text + + class Rubrique(SpipRubriques): class Meta: table_name: str = "spip_rubriques" @@ -178,6 +190,8 @@ class Rubrique(SpipRubriques): ) for d in documents: self.texte = link_document(self.texte, d.id_document, d.titre, d.slug()) + # Internal (articles) links + self.text = link_articles(self.texte) return documents def slug(self, date: bool = False) -> str: