fix unknown chars warning + better context
This commit is contained in:
parent
a83ec1da3c
commit
477037573a
@ -236,6 +236,22 @@ iso_to_utf = (
|
|||||||
"ç",
|
"ç",
|
||||||
r"ç",
|
r"ç",
|
||||||
),
|
),
|
||||||
|
( # Fix UTF-8 « that was interpreted as ISO 8859-1
|
||||||
|
"«",
|
||||||
|
r"«",
|
||||||
|
),
|
||||||
|
( # Fix UTF-8 » that was interpreted as ISO 8859-1
|
||||||
|
"»",
|
||||||
|
r"»",
|
||||||
|
),
|
||||||
|
( # Fix UTF-8 ° that was interpreted as ISO 8859-1
|
||||||
|
"°",
|
||||||
|
r"°",
|
||||||
|
),
|
||||||
|
( # Fix UTF-8 nbsp that was interpreted as ISO 8859-1
|
||||||
|
"Â ",
|
||||||
|
r" ",
|
||||||
|
),
|
||||||
( # Fix UTF-8 í that was interpreted as ISO 8859-1
|
( # Fix UTF-8 í that was interpreted as ISO 8859-1
|
||||||
"iÌ\u0081",
|
"iÌ\u0081",
|
||||||
r"í",
|
r"í",
|
||||||
@ -290,9 +306,14 @@ def highlight_unknown_chars(text: str, pre: str, post: str) -> str:
|
|||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
def get_unknown_chars(text: str) -> list[str]:
|
def get_unknown_chars(text: str, context_length: int = 20) -> list[str]:
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
|
context: str = r".{0," + str(context_length) + r"}"
|
||||||
for char in unknown_iso:
|
for char in unknown_iso:
|
||||||
for match in finditer(r".{0-20}" + char + r".*(?=\r?\n|$)", text):
|
matches = finditer(
|
||||||
|
context + r"(?=" + char + r")" + char + r".*?(?=\r?\n|$)",
|
||||||
|
text,
|
||||||
|
)
|
||||||
|
for match in matches:
|
||||||
errors.append(match.group())
|
errors.append(match.group())
|
||||||
return errors
|
return errors
|
||||||
|
@ -89,7 +89,6 @@ if __name__ == "__main__": # Following is executed only if script is directly e
|
|||||||
with open(articlepath, "w") as f:
|
with open(articlepath, "w") as f:
|
||||||
f.write(article.get_content())
|
f.write(article.get_content())
|
||||||
# Store articles with unknown characters
|
# Store articles with unknown characters
|
||||||
print(f"UNKNOWN CHARS {get_unknown_chars(article.text)}")
|
|
||||||
if len(get_unknown_chars(article.text)) > 0:
|
if len(get_unknown_chars(article.text)) > 0:
|
||||||
unknown_chars_articles.append(article)
|
unknown_chars_articles.append(article)
|
||||||
# Print the outputted file’s path when finished exporting the article
|
# Print the outputted file’s path when finished exporting the article
|
||||||
@ -112,6 +111,6 @@ if __name__ == "__main__": # Following is executed only if script is directly e
|
|||||||
)
|
)
|
||||||
# Print the context in which the unknown characters are found
|
# Print the context in which the unknown characters are found
|
||||||
for text in unknown_chars_apparitions:
|
for text in unknown_chars_apparitions:
|
||||||
print(f" {BOLD}…{RESET} " + highlight_unknown_chars(text, R, RESET))
|
print(f" {BOLD}…{RESET} " + highlight_unknown_chars(text, R, RESET) + " …")
|
||||||
|
|
||||||
db.close() # Close the connection with the database
|
db.close() # Close the connection with the database
|
||||||
|
Loading…
Reference in New Issue
Block a user