fix unknown chars warning + better context

This commit is contained in:
Guilhem Fauré 2023-05-22 11:23:51 +02:00
parent a83ec1da3c
commit 477037573a
2 changed files with 24 additions and 4 deletions

View File

@ -236,6 +236,22 @@ iso_to_utf = (
"ç",
r"ç",
),
( # Fix UTF-8 « that was interpreted as ISO 8859-1
"«",
r"«",
),
( # Fix UTF-8 » that was interpreted as ISO 8859-1
"»",
r"»",
),
( # Fix UTF-8 ° that was interpreted as ISO 8859-1
"°",
r"°",
),
( # Fix UTF-8 nbsp that was interpreted as ISO 8859-1
" ",
r" ",
),
( # Fix UTF-8 í that was interpreted as ISO 8859-1
"\u0081",
r"í",
@ -290,9 +306,14 @@ def highlight_unknown_chars(text: str, pre: str, post: str) -> str:
return text
def get_unknown_chars(text: str, context_length: int = 20) -> list[str]:
    """Return each occurrence of an unknown character together with its context.

    For every entry of ``unknown_iso``, ``text`` is searched and each hit is
    returned as one string made of:

    * up to ``context_length`` characters preceding the entry (never crossing
      a line break, since ``.`` does not match a newline by default),
    * the entry itself,
    * the remainder of the line, stopping before ``\\n`` / ``\\r\\n`` or at
      the end of ``text``.

    Args:
        text: The text to scan.
        context_length: Maximum number of characters kept before the unknown
            character. Negative values are treated as ``0``.

    Returns:
        The matched excerpts, grouped by ``unknown_iso`` entry in iteration
        order, then by position in ``text``. Empty when nothing is found.
    """
    # A negative bound would not form a valid `{m,n}` quantifier (the braces
    # would then be matched literally), so clamp it to zero.
    context: str = r".{0," + str(max(context_length, 0)) + r"}"
    errors: list[str] = []
    for char in unknown_iso:
        # NOTE(review): entries are inserted as regex fragments, as before;
        # the non-capturing group only keeps an entry containing `|` from
        # splitting the surrounding expression — confirm entries are meant
        # to be patterns rather than literal text.
        matches = finditer(
            context + r"(?:" + char + r").*?(?=\r?\n|$)",
            text,
        )
        errors.extend(match.group() for match in matches)
    return errors

View File

@ -89,7 +89,6 @@ if __name__ == "__main__": # Following is executed only if script is directly e
with open(articlepath, "w") as f:
f.write(article.get_content())
# Store articles with unknown characters
print(f"UNKNOWN CHARS {get_unknown_chars(article.text)}")
if len(get_unknown_chars(article.text)) > 0:
unknown_chars_articles.append(article)
# Print the outputted files path when finished exporting the article
@ -112,6 +111,6 @@ if __name__ == "__main__": # Following is executed only if script is directly e
)
# Print the context in which the unknown characters are found
for text in unknown_chars_apparitions:
print(f" {BOLD}{RESET} " + highlight_unknown_chars(text, R, RESET))
print(f" {BOLD}{RESET} " + highlight_unknown_chars(text, R, RESET) + "")
db.close() # Close the connection with the database