fix unknown chars warning + better context
This commit is contained in:
parent
a83ec1da3c
commit
477037573a
@ -236,6 +236,22 @@ iso_to_utf = (
|
||||
"ç",
|
||||
r"ç",
|
||||
),
|
||||
( # Fix UTF-8 « that was interpreted as ISO 8859-1
|
||||
"«",
|
||||
r"«",
|
||||
),
|
||||
( # Fix UTF-8 » that was interpreted as ISO 8859-1
|
||||
"»",
|
||||
r"»",
|
||||
),
|
||||
( # Fix UTF-8 ° that was interpreted as ISO 8859-1
|
||||
"°",
|
||||
r"°",
|
||||
),
|
||||
( # Fix UTF-8 nbsp that was interpreted as ISO 8859-1
|
||||
"Â ",
|
||||
r" ",
|
||||
),
|
||||
( # Fix UTF-8 í that was interpreted as ISO 8859-1
|
||||
"iÌ\u0081",
|
||||
r"í",
|
||||
@ -290,9 +306,14 @@ def highlight_unknown_chars(text: str, pre: str, post: str) -> str:
|
||||
return text
|
||||
|
||||
|
||||
def get_unknown_chars(text: str, context_length: int = 20) -> list[str]:
    """Return each occurrence of an unknown character with its context.

    Every entry of ``unknown_iso`` is searched for in ``text``. Each hit is
    reported as one string made of up to ``context_length`` characters that
    precede the unknown character on the same line, the unknown character
    itself, and the remainder of that line.

    Args:
        text: The text to scan.
        context_length: Maximum number of characters of leading context to
            keep before the unknown character. Must be non-negative.

    Returns:
        The matched snippets, grouped in the iteration order of
        ``unknown_iso`` and, within one entry, in order of appearance.

    Raises:
        ValueError: If ``context_length`` is negative.
    """
    # A negative bound would produce ".{0,-N}", which the regex engine reads
    # as literal text instead of a repetition, so nothing would ever match.
    if context_length < 0:
        raise ValueError("context_length must be non-negative")

    errors: list[str] = []
    # Bounded leading context; "." does not match a newline, so the context
    # never crosses into the previous line.
    context: str = r".{0," + str(context_length) + r"}"
    for char in unknown_iso:
        # NOTE: entries are interpolated into the pattern unescaped, so they
        # are interpreted as regular-expression fragments.
        matches = finditer(
            context + r"(?=" + char + r")" + char + r".*?(?=\r?\n|$)",
            text,
        )
        errors.extend(match.group() for match in matches)
    return errors
|
||||
|
@ -89,7 +89,6 @@ if __name__ == "__main__": # Following is executed only if script is directly e
|
||||
with open(articlepath, "w") as f:
|
||||
f.write(article.get_content())
|
||||
# Store articles with unknown characters
|
||||
print(f"UNKNOWN CHARS {get_unknown_chars(article.text)}")
|
||||
if len(get_unknown_chars(article.text)) > 0:
|
||||
unknown_chars_articles.append(article)
|
||||
# Print the outputted file’s path when finished exporting the article
|
||||
@ -112,6 +111,6 @@ if __name__ == "__main__": # Following is executed only if script is directly e
|
||||
)
|
||||
# Print the context in which the unknown characters are found
|
||||
for text in unknown_chars_apparitions:
|
||||
print(f" {BOLD}…{RESET} " + highlight_unknown_chars(text, R, RESET))
|
||||
print(f" {BOLD}…{RESET} " + highlight_unknown_chars(text, R, RESET) + " …")
|
||||
|
||||
db.close() # Close the connection with the database
|
||||
|
Loading…
Reference in New Issue
Block a user