diff --git a/spip2md/converter.py b/spip2md/converter.py index f221afc..0e56608 100644 --- a/spip2md/converter.py +++ b/spip2md/converter.py @@ -236,6 +236,22 @@ iso_to_utf = ( "ç", r"ç", ), + ( # Fix UTF-8 « that was interpreted as ISO 8859-1 + "«", + r"«", + ), + ( # Fix UTF-8 » that was interpreted as ISO 8859-1 + "»", + r"»", + ), + ( # Fix UTF-8 ° that was interpreted as ISO 8859-1 + "°", + r"°", + ), + ( # Fix UTF-8 nbsp that was interpreted as ISO 8859-1 + " ", + r" ", + ), ( # Fix UTF-8 í that was interpreted as ISO 8859-1 "iÌ\u0081", r"í", @@ -290,9 +306,14 @@ def highlight_unknown_chars(text: str, pre: str, post: str) -> str: return text -def get_unknown_chars(text: str) -> list[str]: +def get_unknown_chars(text: str, context_length: int = 20) -> list[str]: errors: list[str] = [] + context: str = r".{0," + str(context_length) + r"}" for char in unknown_iso: - for match in finditer(r".{0-20}" + char + r".*(?=\r?\n|$)", text): + matches = finditer( + context + r"(?=" + char + r")" + char + r".*?(?=\r?\n|$)", + text, + ) + for match in matches: errors.append(match.group()) return errors diff --git a/spip2md/main.py b/spip2md/main.py index e5f841d..1b907f4 100755 --- a/spip2md/main.py +++ b/spip2md/main.py @@ -89,7 +89,6 @@ if __name__ == "__main__": # Following is executed only if script is directly e with open(articlepath, "w") as f: f.write(article.get_content()) # Store articles with unknown characters - print(f"UNKNOWN CHARS {get_unknown_chars(article.text)}") if len(get_unknown_chars(article.text)) > 0: unknown_chars_articles.append(article) # Print the outputted file’s path when finished exporting the article @@ -112,6 +111,6 @@ if __name__ == "__main__": # Following is executed only if script is directly e ) # Print the context in which the unknown characters are found for text in unknown_chars_apparitions: - print(f" {BOLD}…{RESET} " + highlight_unknown_chars(text, R, RESET)) + print(f" {BOLD}…{RESET} " + highlight_unknown_chars(text, R, RESET) + " …") db.close() # Close the connection with the database