Fix encoding replacement (the result of str.replace was being discarded)

This commit is contained in:
Guilhem Fauré 2023-05-16 16:08:03 +02:00
parent e1c8bd4b2e
commit 619376003f
2 changed files with 6 additions and 5 deletions

View File

@ -258,6 +258,7 @@ unknown_iso = (
r"∆", # unknown â^† r"∆", # unknown â^†
) )
# Define terminal escape sequences to stylize output, regex escaped # Define terminal escape sequences to stylize output, regex escaped
RED: str = "\033[91m" RED: str = "\033[91m"
BOLD: str = "\033[1m" BOLD: str = "\033[1m"
@ -268,7 +269,7 @@ def convert_body(text: str) -> str:
for spip, markdown in spip_to_markdown: for spip, markdown in spip_to_markdown:
text = spip.sub(markdown, text) text = spip.sub(markdown, text)
for iso, utf in iso_to_utf: for iso, utf in iso_to_utf:
text.replace(iso, utf) text = text.replace(iso, utf)
return text return text
@ -276,7 +277,7 @@ def convert_meta(text: str) -> str:
for spip, metadata in spip_to_text: for spip, metadata in spip_to_text:
text = spip.sub(metadata, text) text = spip.sub(metadata, text)
for iso, utf in iso_to_utf: for iso, utf in iso_to_utf:
text.replace(iso, utf) text = text.replace(iso, utf)
return text return text
@ -289,11 +290,11 @@ def remove_unknown_chars(text: str) -> str:
def highlight_unknown_chars(text: str) -> str: def highlight_unknown_chars(text: str) -> str:
# Highlight in COLOR unknown chars in text # Highlight in COLOR unknown chars in text
for char in unknown_iso: for char in unknown_iso:
for match in finditer(char, text): for match in finditer("(" + char + ")+", text):
text = ( text = (
text[: match.start()] text[: match.start()]
+ RED + RED
+ BOLD # + BOLD
+ match.group() + match.group()
+ RESET + RESET
+ text[match.end() :] + text[match.end() :]

View File

@ -62,7 +62,7 @@ for article in unknown_chars_articles:
print( print(
f"\n{BOLD}{nb}{RESET} unknown character{s} " f"\n{BOLD}{nb}{RESET} unknown character{s} "
+ f"detected in article {BOLD}{article.id}{RESET}" + f"detected in article {BOLD}{article.id}{RESET}"
+ f"\n{BOLD}·{RESET} " + f"\n{BOLD}Title:{RESET} "
+ highlight_unknown_chars(article.title) + highlight_unknown_chars(article.title)
) )
for text in unknown_chars_apparitions: for text in unknown_chars_apparitions: