<multi> blocks & unknown chars logging in spip2md.log
This commit is contained in:
parent
93fc0862d6
commit
d20976c59d
@ -1,7 +1,6 @@
|
||||
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
|
||||
# Top level functions
|
||||
import sys
|
||||
from os import makedirs
|
||||
from os import makedirs, remove
|
||||
from shutil import rmtree
|
||||
|
||||
from spip2md.config import CFG
|
||||
@ -49,64 +48,23 @@ def main(*argv):
|
||||
if CFG.clear_output:
|
||||
rmtree(CFG.output_dir, True)
|
||||
makedirs(CFG.output_dir, exist_ok=True)
|
||||
# Clear the log file
|
||||
# if CFG.clear_log:
|
||||
# remove(CFG.logfile)
|
||||
|
||||
# Get the virtual id=0 section
|
||||
root: Rubrique = RootRubrique()
|
||||
|
||||
# Write everything while printing the output human-readably
|
||||
branches, leaves = count_output(root.write_tree(CFG.output_dir))
|
||||
# End, summary message
|
||||
print(
|
||||
f"""
|
||||
|
||||
DB.close() # Close the connection with the database
|
||||
|
||||
print( # End, summary message
|
||||
f"""\
|
||||
Exported a total of {esc(BOLD)}{leaves}{esc()} Markdown files, \
|
||||
stored into {esc(BOLD)}{branches}{esc()} directories"""
|
||||
)
|
||||
|
||||
# print() # Break line between export & unknown characters warning
|
||||
# Warn about each article that contains unknown character(s)
|
||||
# TODO do it with Python warnings
|
||||
|
||||
DB.close() # Close the connection with the database
|
||||
|
||||
|
||||
r""" OLD CODE
|
||||
# Print the detected unknown chars in article in their context but highlighted
|
||||
def warn_unknown_chars(article: Article) -> None:
|
||||
# Print the title of the article in which there is unknown characters
|
||||
# & the number of them
|
||||
unknown_chars_apparitions: list[str] = unknown_chars_context(article.texte)
|
||||
nb: int = len(unknown_chars_apparitions)
|
||||
s: str = "s" if nb > 1 else ""
|
||||
style(f"{nb}")
|
||||
print(f" unknown character{s} in", end="")
|
||||
style(f" {article.lang} ")
|
||||
highlight(article.titre, *unknown_chars(article.titre))
|
||||
print() # Break line
|
||||
# Print the context in which the unknown characters are found
|
||||
for text in unknown_chars_apparitions:
|
||||
style(" … ")
|
||||
highlight(text, *unknown_chars(text))
|
||||
style(" … \n")
|
||||
print() # Break line
|
||||
|
||||
# Return a list of tuples giving the start and end of unknown substring in text
|
||||
def unknown_chars(text: str) -> list[tuple[int, int]]:
|
||||
positions: list[tuple[int, int]] = []
|
||||
for char in UNKNOWN_ISO:
|
||||
for match in finditer("(" + char + ")+", text):
|
||||
positions.append((match.start(), match.end()))
|
||||
return positions
|
||||
|
||||
# Return strings with unknown chars found in text, surrounded by context_length chars
|
||||
def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
|
||||
errors: list[str] = []
|
||||
context: str = r".{0," + str(context_length) + r"}"
|
||||
for char in UNKNOWN_ISO:
|
||||
matches = finditer(
|
||||
context + r"(?=" + char + r")" + char + context,
|
||||
text,
|
||||
)
|
||||
for match in matches:
|
||||
errors.append(match.group())
|
||||
return errors
|
||||
"""
|
||||
# Warn about issued warnings in log file
|
||||
print(f"\nThere might be warnings in {esc(BOLD)}{CFG.logfile}{esc()}")
|
||||
|
@ -22,10 +22,12 @@ class Configuration:
|
||||
output_dir: str = "output/"
|
||||
data_dir: str = "data/"
|
||||
clear_output: bool = False
|
||||
clear_log: bool = True
|
||||
prepend_h1: bool = True
|
||||
export_filetype: str = "md"
|
||||
max_articles_export: int = 1000 # TODO reimplement with recursion
|
||||
max_sections_export: int = 500 # TODO reimplement with recursion
|
||||
logfile: str = "spip2md.log"
|
||||
|
||||
def __init__(self, config_file: Optional[str] = None):
|
||||
if config_file is not None:
|
||||
|
@ -114,7 +114,9 @@ DOCUMENT_LINK_REPL = r"\1[\2{}]({})"
|
||||
|
||||
# Multi language block, to be further processed per lang
|
||||
MULTILANG_BLOCK = compile(r"<multi>(.+?)<\/multi>", S | I)
|
||||
MULTILANGS = compile(r"\[([a-zA-Z\-]{2,6})\]\s*(.+?)(?=\[[a-zA-Z\-]{2,6}\]|$)", S | I)
|
||||
MULTILANGS = compile(
|
||||
r"\[([a-zA-Z\-]{2,6})\]\s*(.+?)\s*(?=\[[a-zA-Z\-]{2,6}\]|$)", S | I
|
||||
)
|
||||
|
||||
# WARNING probably useless text in metadata fields, to be removed
|
||||
BLOAT = (
|
||||
|
@ -1,7 +1,8 @@
|
||||
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
|
||||
import logging
|
||||
from os import makedirs
|
||||
from os.path import basename, splitext
|
||||
from re import finditer, sub
|
||||
from re import finditer, search, sub
|
||||
from shutil import copyfile
|
||||
from typing import Any, Match, Optional
|
||||
|
||||
@ -33,6 +34,9 @@ from spip2md.regexmap import (
|
||||
)
|
||||
from spip2md.style import BLUE, BOLD, GREEN, WARNING_STYLE, YELLOW, esc
|
||||
|
||||
# Output logs to logfile
|
||||
logging.basicConfig(filename=CFG.logfile, encoding="utf-8")
|
||||
|
||||
|
||||
class SpipWritable:
|
||||
term_color: int
|
||||
@ -57,13 +61,28 @@ class SpipWritable:
|
||||
first_lang = lang.group(2)
|
||||
else:
|
||||
pass
|
||||
# print("Found other language for", first_lang, ":", lang.groups())
|
||||
logging.warning(
|
||||
f"Ignored {lang.group(1)} translation of {first_lang[:40]}: "
|
||||
+ lang.group(2)[:40],
|
||||
)
|
||||
return first_lang
|
||||
|
||||
return MULTILANG_BLOCK.sub(replace_lang, text)
|
||||
|
||||
# Apply different mappings to a text field, like SPIP to Markdown or encoding
|
||||
def convert(self, text: Optional[str]) -> str:
|
||||
# Return unknown char surrounded by context_len chars
|
||||
# Return the first occurrence of char in text, surrounded by up to context_len
# chars of context on each side, or char itself if it does not occur in text
def unknown_chars_context(text: str, char: str, context_len: int = 24) -> str:
    # Local import: escape is not among the names imported from re at file level
    from re import escape

    # Match char literally, even if it contains regex metacharacters such as
    # "?" or "(" (unescaped, these raise re.error or match something else)
    # NOTE(review): assumes callers pass literal strings, not regex patterns — confirm
    literal: str = escape(char)
    # "." does not match newlines here, so the context never spans several lines
    context: str = r".{0," + str(context_len) + r"}"
    match = search(context + literal + context, text)
    if match is not None:
        return match.group()
    # char was not found: fall back to the bare char so callers always get a str
    return char
|
||||
|
||||
if text is not None and len(text) > 0:
|
||||
for spip, markdown in SPIP_MARKDOWN:
|
||||
text = spip.sub(markdown, text)
|
||||
@ -71,6 +90,14 @@ class SpipWritable:
|
||||
text = bloat.sub("", text)
|
||||
for iso, utf in ISO_UTF:
|
||||
text = text.replace(iso, utf)
|
||||
for char in UNKNOWN_ISO:
|
||||
lastend: int = 0
|
||||
for match in finditer("(" + char + ")+", text):
|
||||
context: str = unknown_chars_context(text[lastend:], char)
|
||||
logging.warn(
|
||||
f"Unknown char {char} found in {self.titre[:40]} at: {context}"
|
||||
)
|
||||
lastend = match.end()
|
||||
text = self.translate(text)
|
||||
else:
|
||||
return ""
|
||||
|
Loading…
Reference in New Issue
Block a user