<multi> blocks & unknown chars logging in spip2md.log
This commit is contained in:
parent
93fc0862d6
commit
d20976c59d
@ -1,7 +1,6 @@
|
|||||||
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
|
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
|
||||||
# Top level functions
|
|
||||||
import sys
|
import sys
|
||||||
from os import makedirs
|
from os import makedirs, remove
|
||||||
from shutil import rmtree
|
from shutil import rmtree
|
||||||
|
|
||||||
from spip2md.config import CFG
|
from spip2md.config import CFG
|
||||||
@ -49,64 +48,23 @@ def main(*argv):
|
|||||||
if CFG.clear_output:
|
if CFG.clear_output:
|
||||||
rmtree(CFG.output_dir, True)
|
rmtree(CFG.output_dir, True)
|
||||||
makedirs(CFG.output_dir, exist_ok=True)
|
makedirs(CFG.output_dir, exist_ok=True)
|
||||||
|
# Clear the log file
|
||||||
|
# if CFG.clear_log:
|
||||||
|
# remove(CFG.logfile)
|
||||||
|
|
||||||
# Get the virtual id=0 section
|
# Get the virtual id=0 section
|
||||||
root: Rubrique = RootRubrique()
|
root: Rubrique = RootRubrique()
|
||||||
|
|
||||||
# Write everything while printing the output human-readably
|
# Write everything while printing the output human-readably
|
||||||
branches, leaves = count_output(root.write_tree(CFG.output_dir))
|
branches, leaves = count_output(root.write_tree(CFG.output_dir))
|
||||||
# End, summary message
|
|
||||||
print(
|
DB.close() # Close the connection with the database
|
||||||
f"""
|
|
||||||
|
print( # End, summary message
|
||||||
|
f"""\
|
||||||
Exported a total of {esc(BOLD)}{leaves}{esc()} Markdown files, \
|
Exported a total of {esc(BOLD)}{leaves}{esc()} Markdown files, \
|
||||||
stored into {esc(BOLD)}{branches}{esc()} directories"""
|
stored into {esc(BOLD)}{branches}{esc()} directories"""
|
||||||
)
|
)
|
||||||
|
|
||||||
# print() # Break line between export & unknown characters warning
|
# Warn about issued warnings in log file
|
||||||
# Warn about each article that contains unknown character(s)
|
print(f"\nThere might be warnings in {esc(BOLD)}{CFG.logfile}{esc()}")
|
||||||
# TODO do it with Python warnings
|
|
||||||
|
|
||||||
DB.close() # Close the connection with the database
|
|
||||||
|
|
||||||
|
|
||||||
r""" OLD CODE
|
|
||||||
# Print the detected unknown chars in article in their context but highlighted
|
|
||||||
def warn_unknown_chars(article: Article) -> None:
|
|
||||||
# Print the title of the article in which there are unknown characters
|
|
||||||
# & the number of them
|
|
||||||
unknown_chars_apparitions: list[str] = unknown_chars_context(article.texte)
|
|
||||||
nb: int = len(unknown_chars_apparitions)
|
|
||||||
s: str = "s" if nb > 1 else ""
|
|
||||||
style(f"{nb}")
|
|
||||||
print(f" unknown character{s} in", end="")
|
|
||||||
style(f" {article.lang} ")
|
|
||||||
highlight(article.titre, *unknown_chars(article.titre))
|
|
||||||
print() # Break line
|
|
||||||
# Print the context in which the unknown characters are found
|
|
||||||
for text in unknown_chars_apparitions:
|
|
||||||
style(" … ")
|
|
||||||
highlight(text, *unknown_chars(text))
|
|
||||||
style(" … \n")
|
|
||||||
print() # Break line
|
|
||||||
|
|
||||||
# Return a list of tuples giving the start and end of each unknown substring in text
|
|
||||||
def unknown_chars(text: str) -> list[tuple[int, int]]:
|
|
||||||
positions: list[tuple[int, int]] = []
|
|
||||||
for char in UNKNOWN_ISO:
|
|
||||||
for match in finditer("(" + char + ")+", text):
|
|
||||||
positions.append((match.start(), match.end()))
|
|
||||||
return positions
|
|
||||||
|
|
||||||
# Return strings with unknown chars found in text, surrounded by context_length chars
|
|
||||||
def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
|
|
||||||
errors: list[str] = []
|
|
||||||
context: str = r".{0," + str(context_length) + r"}"
|
|
||||||
for char in UNKNOWN_ISO:
|
|
||||||
matches = finditer(
|
|
||||||
context + r"(?=" + char + r")" + char + context,
|
|
||||||
text,
|
|
||||||
)
|
|
||||||
for match in matches:
|
|
||||||
errors.append(match.group())
|
|
||||||
return errors
|
|
||||||
"""
|
|
||||||
|
@ -22,10 +22,12 @@ class Configuration:
|
|||||||
output_dir: str = "output/"
|
output_dir: str = "output/"
|
||||||
data_dir: str = "data/"
|
data_dir: str = "data/"
|
||||||
clear_output: bool = False
|
clear_output: bool = False
|
||||||
|
clear_log: bool = True
|
||||||
prepend_h1: bool = True
|
prepend_h1: bool = True
|
||||||
export_filetype: str = "md"
|
export_filetype: str = "md"
|
||||||
max_articles_export: int = 1000 # TODO reimplement with recursion
|
max_articles_export: int = 1000 # TODO reimplement with recursion
|
||||||
max_sections_export: int = 500 # TODO reimplement with recursion
|
max_sections_export: int = 500 # TODO reimplement with recursion
|
||||||
|
logfile: str = "spip2md.log"
|
||||||
|
|
||||||
def __init__(self, config_file: Optional[str] = None):
|
def __init__(self, config_file: Optional[str] = None):
|
||||||
if config_file is not None:
|
if config_file is not None:
|
||||||
|
@ -114,7 +114,9 @@ DOCUMENT_LINK_REPL = r"\1[\2{}]({})"
|
|||||||
|
|
||||||
# Multi language block, to be further processed per lang
|
# Multi language block, to be further processed per lang
|
||||||
MULTILANG_BLOCK = compile(r"<multi>(.+?)<\/multi>", S | I)
|
MULTILANG_BLOCK = compile(r"<multi>(.+?)<\/multi>", S | I)
|
||||||
MULTILANGS = compile(r"\[([a-zA-Z\-]{2,6})\]\s*(.+?)(?=\[[a-zA-Z\-]{2,6}\]|$)", S | I)
|
MULTILANGS = compile(
|
||||||
|
r"\[([a-zA-Z\-]{2,6})\]\s*(.+?)\s*(?=\[[a-zA-Z\-]{2,6}\]|$)", S | I
|
||||||
|
)
|
||||||
|
|
||||||
# WARNING probably useless text in metadata fields, to be removed
|
# WARNING probably useless text in metadata fields, to be removed
|
||||||
BLOAT = (
|
BLOAT = (
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
|
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
|
||||||
|
import logging
|
||||||
from os import makedirs
|
from os import makedirs
|
||||||
from os.path import basename, splitext
|
from os.path import basename, splitext
|
||||||
from re import finditer, sub
|
from re import finditer, search, sub
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
from typing import Any, Match, Optional
|
from typing import Any, Match, Optional
|
||||||
|
|
||||||
@ -33,6 +34,9 @@ from spip2md.regexmap import (
|
|||||||
)
|
)
|
||||||
from spip2md.style import BLUE, BOLD, GREEN, WARNING_STYLE, YELLOW, esc
|
from spip2md.style import BLUE, BOLD, GREEN, WARNING_STYLE, YELLOW, esc
|
||||||
|
|
||||||
|
# Output logs to logfile
|
||||||
|
logging.basicConfig(filename=CFG.logfile, encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
class SpipWritable:
|
class SpipWritable:
|
||||||
term_color: int
|
term_color: int
|
||||||
@ -57,13 +61,28 @@ class SpipWritable:
|
|||||||
first_lang = lang.group(2)
|
first_lang = lang.group(2)
|
||||||
else:
|
else:
|
||||||
pass
|
pass
|
||||||
# print("Found other language for", first_lang, ":", lang.groups())
|
logging.warning(
|
||||||
|
f"Ignored {lang.group(1)} translation of {first_lang[:40]}: "
|
||||||
|
+ lang.group(2)[:40],
|
||||||
|
)
|
||||||
return first_lang
|
return first_lang
|
||||||
|
|
||||||
return MULTILANG_BLOCK.sub(replace_lang, text)
|
return MULTILANG_BLOCK.sub(replace_lang, text)
|
||||||
|
|
||||||
# Apply different mappings to a text field, like SPIP to Markdown or encoding
|
# Apply different mappings to a text field, like SPIP to Markdown or encoding
|
||||||
def convert(self, text: Optional[str]) -> str:
|
def convert(self, text: Optional[str]) -> str:
|
||||||
|
# Return the first occurrence of the unknown char, surrounded by up to context_len chars
|
||||||
|
def unknown_chars_context(text: str, char: str, context_len: int = 24) -> str:
|
||||||
|
context: str = r".{0," + str(context_len) + r"}"
|
||||||
|
match = search(
|
||||||
|
context + r"(?=" + char + r")" + char + context,
|
||||||
|
text,
|
||||||
|
)
|
||||||
|
if match is not None:
|
||||||
|
return match.group()
|
||||||
|
else:
|
||||||
|
return char
|
||||||
|
|
||||||
if text is not None and len(text) > 0:
|
if text is not None and len(text) > 0:
|
||||||
for spip, markdown in SPIP_MARKDOWN:
|
for spip, markdown in SPIP_MARKDOWN:
|
||||||
text = spip.sub(markdown, text)
|
text = spip.sub(markdown, text)
|
||||||
@ -71,6 +90,14 @@ class SpipWritable:
|
|||||||
text = bloat.sub("", text)
|
text = bloat.sub("", text)
|
||||||
for iso, utf in ISO_UTF:
|
for iso, utf in ISO_UTF:
|
||||||
text = text.replace(iso, utf)
|
text = text.replace(iso, utf)
|
||||||
|
for char in UNKNOWN_ISO:
|
||||||
|
lastend: int = 0
|
||||||
|
for match in finditer("(" + char + ")+", text):
|
||||||
|
context: str = unknown_chars_context(text[lastend:], char)
|
||||||
|
logging.warn(
|
||||||
|
f"Unknown char {char} found in {self.titre[:40]} at: {context}"
|
||||||
|
)
|
||||||
|
lastend = match.end()
|
||||||
text = self.translate(text)
|
text = self.translate(text)
|
||||||
else:
|
else:
|
||||||
return ""
|
return ""
|
||||||
|
Loading…
Reference in New Issue
Block a user