better unknown chars conversion

This commit is contained in:
Guilhem Fauré 2023-05-22 11:32:05 +02:00
parent 477037573a
commit 1dc7d72987
3 changed files with 18 additions and 9 deletions

View File

@@ -1,5 +1,6 @@
# pyright: strict
from re import I, S, compile, finditer
from typing import Optional
# SPIP syntax to Markdown
spip_to_markdown = (
@@ -274,7 +275,9 @@ unknown_iso = (
)
def convert_body(text: str) -> str:
def convert_body(text: Optional[str]) -> str:
if text is None:
return ""
for spip, markdown in spip_to_markdown:
text = spip.sub(markdown, text)
for iso, utf in iso_to_utf:
@@ -282,7 +285,9 @@ def convert_body(text: str) -> str:
return text
def convert_meta(text: str) -> str:
def convert_meta(text: Optional[str]) -> str:
if text is None:
return ""
for spip, metadata in spip_to_text:
text = spip.sub(metadata, text)
for iso, utf in iso_to_utf:

View File

@@ -27,7 +27,7 @@ class Item:
self.lang: str = item.lang
self.set_lang: bool = item.langue_choisie # TODO Why?
self.translation_key: int = item.id_trad
self.extra: str = item.extra # Probably unused
self.extra: str = convert_body(item.extra) # Probably unused
def get_slug(self, date: bool = False) -> str:
return slugify(f"{self.publication if date else ''}-{self.title}")
@@ -64,7 +64,7 @@ class Item:
if len(self.text) > 0:
body += "\n\n" + self.text
# Same with an "extra" section
if self.extra is not None and len(self.extra) > 0:
if len(self.extra) > 0:
body += "\n\n# EXTRA\n\n" + self.extra
return body
@@ -77,10 +77,10 @@ class Article(Item):
def __init__(self, article) -> None:
super().__init__(article)
self.id: int = article.id_article
self.surtitle: str = article.surtitre # Probably unused
self.subtitle: str = article.soustitre # Probably unused
self.caption: str = article.chapo # Probably unused
self.ps: str = article.ps # Probably unused
self.surtitle: str = convert_meta(article.surtitre) # Probably unused
self.subtitle: str = convert_meta(article.soustitre) # Probably unused
self.caption: str = convert_body(article.chapo) # Probably unused
self.ps: str = convert_body(article.ps) # Probably unused
self.update_2: str = article.date_modif # Probably unused duplicate of maj
self.creation: str = article.date_redac
self.forum: bool = article.accepter_forum # TODO Why?

View File

@@ -111,6 +111,10 @@ if __name__ == "__main__": # Following is executed only if script is directly e
)
# Print the context in which the unknown characters are found
for text in unknown_chars_apparitions:
print(f" {BOLD}{RESET} " + highlight_unknown_chars(text, R, RESET) + "")
print(
f" {BOLD}{RESET} "
+ highlight_unknown_chars(text, R, RESET)
+ f" {BOLD}{RESET}"
)
db.close() # Close the connection with the database