<multi> blocks & unknown chars logging in spip2md.log

This commit is contained in:
Guilhem Fauré 2023-05-30 12:16:58 +02:00
parent 93fc0862d6
commit d20976c59d
4 changed files with 45 additions and 56 deletions

View File

@ -1,7 +1,6 @@
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
# Top level functions
import sys import sys
from os import makedirs from os import makedirs, remove
from shutil import rmtree from shutil import rmtree
from spip2md.config import CFG from spip2md.config import CFG
@ -49,64 +48,23 @@ def main(*argv):
if CFG.clear_output: if CFG.clear_output:
rmtree(CFG.output_dir, True) rmtree(CFG.output_dir, True)
makedirs(CFG.output_dir, exist_ok=True) makedirs(CFG.output_dir, exist_ok=True)
# Clear the log file
# if CFG.clear_log:
# remove(CFG.logfile)
# Get the virtual id=0 section # Get the virtual id=0 section
root: Rubrique = RootRubrique() root: Rubrique = RootRubrique()
# Write everything while printing the output human-readably # Write everything while printing the output human-readably
branches, leaves = count_output(root.write_tree(CFG.output_dir)) branches, leaves = count_output(root.write_tree(CFG.output_dir))
# End, summary message
print( DB.close() # Close the connection with the database
f"""
print( # End, summary message
f"""\
Exported a total of {esc(BOLD)}{leaves}{esc()} Markdown files, \ Exported a total of {esc(BOLD)}{leaves}{esc()} Markdown files, \
stored into {esc(BOLD)}{branches}{esc()} directories""" stored into {esc(BOLD)}{branches}{esc()} directories"""
) )
# print() # Break line between export & unknown characters warning # Warn about issued warnings in log file
# Warn about each article that contains unknown(s) character(s) print(f"\nThere might be warnings in {esc(BOLD)}{CFG.logfile}{esc()}")
# TODO do it with Python warnings
DB.close() # Close the connection with the database
r""" OLD CODE
# Print the detected unknown chars in article in their context but highlighted
def warn_unknown_chars(article: Article) -> None:
# Print the title of the article in which there are unknown characters
# & the number of them
unknown_chars_apparitions: list[str] = unknown_chars_context(article.texte)
nb: int = len(unknown_chars_apparitions)
s: str = "s" if nb > 1 else ""
style(f"{nb}")
print(f" unknown character{s} in", end="")
style(f" {article.lang} ")
highlight(article.titre, *unknown_chars(article.titre))
print() # Break line
# Print the context in which the unknown characters are found
for text in unknown_chars_apparitions:
style("")
highlight(text, *unknown_chars(text))
style("\n")
print() # Break line
# Return a list of tuples giving the start and end of unknown substring in text
def unknown_chars(text: str) -> list[tuple[int, int]]:
positions: list[tuple[int, int]] = []
for char in UNKNOWN_ISO:
for match in finditer("(" + char + ")+", text):
positions.append((match.start(), match.end()))
return positions
# Return strings with unknown chars found in text, surrounded by context_length chars
def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
errors: list[str] = []
context: str = r".{0," + str(context_length) + r"}"
for char in UNKNOWN_ISO:
matches = finditer(
context + r"(?=" + char + r")" + char + context,
text,
)
for match in matches:
errors.append(match.group())
return errors
"""

View File

@ -22,10 +22,12 @@ class Configuration:
output_dir: str = "output/" output_dir: str = "output/"
data_dir: str = "data/" data_dir: str = "data/"
clear_output: bool = False clear_output: bool = False
clear_log: bool = True
prepend_h1: bool = True prepend_h1: bool = True
export_filetype: str = "md" export_filetype: str = "md"
max_articles_export: int = 1000 # TODO reimplement with recursion max_articles_export: int = 1000 # TODO reimplement with recursion
max_sections_export: int = 500 # TODO reimplement with recursion max_sections_export: int = 500 # TODO reimplement with recursion
logfile: str = "spip2md.log"
def __init__(self, config_file: Optional[str] = None): def __init__(self, config_file: Optional[str] = None):
if config_file is not None: if config_file is not None:

View File

@ -114,7 +114,9 @@ DOCUMENT_LINK_REPL = r"\1[\2{}]({})"
# Multi language block, to be further processed per lang # Multi language block, to be further processed per lang
MULTILANG_BLOCK = compile(r"<multi>(.+?)<\/multi>", S | I) MULTILANG_BLOCK = compile(r"<multi>(.+?)<\/multi>", S | I)
MULTILANGS = compile(r"\[([a-zA-Z\-]{2,6})\]\s*(.+?)(?=\[[a-zA-Z\-]{2,6}\]|$)", S | I) MULTILANGS = compile(
r"\[([a-zA-Z\-]{2,6})\]\s*(.+?)\s*(?=\[[a-zA-Z\-]{2,6}\]|$)", S | I
)
# WARNING probably useless text in metadata fields, to be removed # WARNING probably useless text in metadata fields, to be removed
BLOAT = ( BLOAT = (

View File

@ -1,7 +1,8 @@
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
import logging
from os import makedirs from os import makedirs
from os.path import basename, splitext from os.path import basename, splitext
from re import finditer, sub from re import finditer, search, sub
from shutil import copyfile from shutil import copyfile
from typing import Any, Match, Optional from typing import Any, Match, Optional
@ -33,6 +34,9 @@ from spip2md.regexmap import (
) )
from spip2md.style import BLUE, BOLD, GREEN, WARNING_STYLE, YELLOW, esc from spip2md.style import BLUE, BOLD, GREEN, WARNING_STYLE, YELLOW, esc
# Output logs to logfile
logging.basicConfig(filename=CFG.logfile, encoding="utf-8")
class SpipWritable: class SpipWritable:
term_color: int term_color: int
@ -57,13 +61,28 @@ class SpipWritable:
first_lang = lang.group(2) first_lang = lang.group(2)
else: else:
pass pass
# print("Found other language for", first_lang, ":", lang.groups()) logging.warning(
f"Ignored {lang.group(1)} translation of {first_lang[:40]}: "
+ lang.group(2)[:40],
)
return first_lang return first_lang
return MULTILANG_BLOCK.sub(replace_lang, text) return MULTILANG_BLOCK.sub(replace_lang, text)
# Apply different mappings to a text field, like SPIP to Markdown or encoding # Apply different mappings to a text field, like SPIP to Markdown or encoding
def convert(self, text: Optional[str]) -> str: def convert(self, text: Optional[str]) -> str:
# Return the first occurrence of char in text, surrounded by context_len chars
def unknown_chars_context(text: str, char: str, context_len: int = 24) -> str:
    """Return the first match of ``char`` in ``text`` with some context around it.

    Args:
        text: The string to look into.
        char: The pattern to look for. It is inserted into the regular
            expression as-is (not escaped), so it is interpreted as a regex.
        context_len: Maximum number of characters kept on each side of the match.

    Returns:
        The matched substring with up to ``context_len`` characters before and
        after it, or ``char`` itself when ``text`` contains no match.
    """
    # "." does not match a newline here (no DOTALL flag), so the context
    # never extends beyond the line on which the match is found
    context: str = r".{0," + str(context_len) + r"}"
    # Wrap the pattern in a non-capturing group so that an alternation inside
    # it cannot split the whole expression and drop the surrounding context
    match = search(context + r"(?:" + char + r")" + context, text)
    if match is None:
        return char
    return match.group()
if text is not None and len(text) > 0: if text is not None and len(text) > 0:
for spip, markdown in SPIP_MARKDOWN: for spip, markdown in SPIP_MARKDOWN:
text = spip.sub(markdown, text) text = spip.sub(markdown, text)
@ -71,6 +90,14 @@ class SpipWritable:
text = bloat.sub("", text) text = bloat.sub("", text)
for iso, utf in ISO_UTF: for iso, utf in ISO_UTF:
text = text.replace(iso, utf) text = text.replace(iso, utf)
for char in UNKNOWN_ISO:
lastend: int = 0
for match in finditer("(" + char + ")+", text):
context: str = unknown_chars_context(text[lastend:], char)
logging.warn(
f"Unknown char {char} found in {self.titre[:40]} at: {context}"
)
lastend = match.end()
text = self.translate(text) text = self.translate(text)
else: else:
return "" return ""