<multi> blocks & unknown chars logging in spip2md.log

This commit is contained in:
Guilhem Fauré 2023-05-30 12:16:58 +02:00
parent 93fc0862d6
commit d20976c59d
4 changed files with 45 additions and 56 deletions

View File

@ -1,7 +1,6 @@
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
# Top level functions
import sys
from os import makedirs
from os import makedirs, remove
from shutil import rmtree
from spip2md.config import CFG
@ -49,64 +48,23 @@ def main(*argv):
if CFG.clear_output:
rmtree(CFG.output_dir, True)
makedirs(CFG.output_dir, exist_ok=True)
# Clear the log file
# if CFG.clear_log:
# remove(CFG.logfile)
# Get the virtual id=0 section
root: Rubrique = RootRubrique()
# Write everything while printing the output human-readably
branches, leaves = count_output(root.write_tree(CFG.output_dir))
# End, summary message
print(
f"""
DB.close() # Close the connection with the database
print( # End, summary message
f"""\
Exported a total of {esc(BOLD)}{leaves}{esc()} Markdown files, \
stored into {esc(BOLD)}{branches}{esc()} directories"""
)
# print() # Break line between export & unknown characters warning
# Warn about each article that contains unknown(s) character(s)
# TODO do it with Python warnings
DB.close() # Close the connection with the database
r""" OLD CODE
# Print the detected unknown chars in article in their context but highlighted
def warn_unknown_chars(article: Article) -> None:
# Print the title of the article in which there is unknown characters
# & the number of them
unknown_chars_apparitions: list[str] = unknown_chars_context(article.texte)
nb: int = len(unknown_chars_apparitions)
s: str = "s" if nb > 1 else ""
style(f"{nb}")
print(f" unknown character{s} in", end="")
style(f" {article.lang} ")
highlight(article.titre, *unknown_chars(article.titre))
print() # Break line
# Print the context in which the unknown characters are found
for text in unknown_chars_apparitions:
style("")
highlight(text, *unknown_chars(text))
style("\n")
print() # Break line
# Return a list of tuples giving the start and end of unknown substring in text
def unknown_chars(text: str) -> list[tuple[int, int]]:
positions: list[tuple[int, int]] = []
for char in UNKNOWN_ISO:
for match in finditer("(" + char + ")+", text):
positions.append((match.start(), match.end()))
return positions
# Return strings with unknown chards found in text, surrounded by context_length chars
def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
errors: list[str] = []
context: str = r".{0," + str(context_length) + r"}"
for char in UNKNOWN_ISO:
matches = finditer(
context + r"(?=" + char + r")" + char + context,
text,
)
for match in matches:
errors.append(match.group())
return errors
"""
# Warn about issued warnings in log file
print(f"\nThere might be warnings in {esc(BOLD)}{CFG.logfile}{esc()}")

View File

@ -22,10 +22,12 @@ class Configuration:
output_dir: str = "output/"
data_dir: str = "data/"
clear_output: bool = False
clear_log: bool = True
prepend_h1: bool = True
export_filetype: str = "md"
max_articles_export: int = 1000 # TODO reimplement with recursion
max_sections_export: int = 500 # TODO reimplement with recursion
logfile: str = "spip2md.log"
def __init__(self, config_file: Optional[str] = None):
if config_file is not None:

View File

@ -114,7 +114,9 @@ DOCUMENT_LINK_REPL = r"\1[\2{}]({})"
# Multi language block, to be further processed per lang
# Matches one whole <multi>...</multi> span; group 1 is the text between the tags.
# NOTE(review): `compile`, `S` and `I` are presumably re.compile, re.DOTALL and
# re.IGNORECASE imported above this excerpt - confirm.
MULTILANG_BLOCK = compile(r"<multi>(.+?)<\/multi>", S | I)
MULTILANGS = compile(r"\[([a-zA-Z\-]{2,6})\]\s*(.+?)(?=\[[a-zA-Z\-]{2,6}\]|$)", S | I)
# One "[code] text" entry of a multi block: group 1 is the 2 to 6 letters/hyphens
# found between the brackets, group 2 is the text that follows, taken lazily up to
# the next "[code]" tag or the end of the text. The extra \s* in front of the
# lookahead keeps the whitespace preceding that boundary out of group 2.
MULTILANGS = compile(
r"\[([a-zA-Z\-]{2,6})\]\s*(.+?)\s*(?=\[[a-zA-Z\-]{2,6}\]|$)", S | I
)
# WARNING probably useless text in metadata fields, to be removed
BLOAT = (

View File

@ -1,7 +1,8 @@
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
import logging
from os import makedirs
from os.path import basename, splitext
from re import finditer, sub
from re import finditer, search, sub
from shutil import copyfile
from typing import Any, Match, Optional
@ -33,6 +34,9 @@ from spip2md.regexmap import (
)
from spip2md.style import BLUE, BOLD, GREEN, WARNING_STYLE, YELLOW, esc
# Output logs to logfile
logging.basicConfig(filename=CFG.logfile, encoding="utf-8")
class SpipWritable:
term_color: int
@ -57,13 +61,28 @@ class SpipWritable:
first_lang = lang.group(2)
else:
pass
# print("Found other language for", first_lang, ":", lang.groups())
logging.warning(
f"Ignored {lang.group(1)} translation of {first_lang[:40]}: "
+ lang.group(2)[:40],
)
return first_lang
return MULTILANG_BLOCK.sub(replace_lang, text)
# Apply different mappings to a text field, like SPIP to Markdown or encoding
def convert(self, text: Optional[str]) -> str:
# Give back the first occurrence of char in text together with its surroundings
def unknown_chars_context(text: str, char: str, context_len: int = 24) -> str:
    """Return `char` with up to `context_len` characters on each side of it.

    Only the first occurrence found in `text` is reported. `char` is inserted
    into the regular expression as-is. When nothing matches, `char` itself is
    returned so the caller always has something to log.
    """
    # Up to context_len characters; "." stops at line breaks
    window: str = rf".{{0,{context_len}}}"
    found = search(f"{window}(?={char}){char}{window}", text)
    if found is None:
        return char
    return found.group()
if text is not None and len(text) > 0:
for spip, markdown in SPIP_MARKDOWN:
text = spip.sub(markdown, text)
@ -71,6 +90,14 @@ class SpipWritable:
text = bloat.sub("", text)
for iso, utf in ISO_UTF:
text = text.replace(iso, utf)
# Report every run of characters that could not be converted, with some context
for char in UNKNOWN_ISO:
    # End offset of the previous run, so the context search starts after it
    lastend: int = 0
    # One log entry per run of consecutive occurrences of this character
    for match in finditer("(" + char + ")+", text):
        context: str = unknown_chars_context(text[lastend:], char)
        # logging.warn is a deprecated alias that was removed in Python 3.13;
        # logging.warning is the supported spelling
        logging.warning(
            f"Unknown char {char} found in {self.titre[:40]} at: {context}"
        )
        lastend = match.end()
text = self.translate(text)
else:
return ""