more logging, some links still don’t convert

This commit is contained in:
Guilhem Fauré 2023-05-30 15:22:39 +02:00
parent 27c281db90
commit 2ba94d03a8
4 changed files with 68 additions and 43 deletions

View File

@ -1,6 +1,8 @@
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
import logging
import sys
from os import makedirs
from os import makedirs, remove
from os.path import isfile
from shutil import rmtree
from spip2md.config import CFG
@ -27,6 +29,18 @@ def count_output(
return (branches, leaves)
# Clear the previous log file if needed
# (the isfile() guard skips the removal when no log file exists yet)
if CFG.clear_log and isfile(CFG.logfile):
    remove(CFG.logfile)
# Configure logging: "LEVEL:message" lines written to CFG.logfile as UTF-8,
# keeping only records at or above CFG.loglevel
logging.basicConfig(
    format="%(levelname)s:%(message)s",
    filename=CFG.logfile,
    encoding="utf-8",
    level=CFG.loglevel,
)
# Connect to the MySQL database with Peewee ORM
DB.init(CFG.db, host=CFG.db_host, user=CFG.db_user, password=CFG.db_pass)
DB.connect()
@ -64,4 +78,4 @@ stored into {esc(BOLD)}{branches}{esc()} directories"""
)
# Warn about issued warnings in log file
print(f"\nThere might be warnings in {esc(BOLD)}{CFG.logfile}{esc()}")
print(f"\nThere might be warnings and infos in {esc(BOLD)}{CFG.logfile}{esc()}")

View File

@ -25,9 +25,10 @@ class Configuration:
clear_log: bool = True
prepend_h1: bool = True
export_filetype: str = "md"
max_articles_export: int = 1000 # TODO reimplement with recursion
max_sections_export: int = 500 # TODO reimplement with recursion
logfile: str = "spip2md.log"
loglevel: str = "INFO"
# max_articles_export: int = 1000 # TODO reimplement with recursion
# max_sections_export: int = 500 # TODO reimplement with recursion
def __init__(self, config_file: Optional[str] = None):
if config_file is not None:

View File

@ -137,6 +137,17 @@ ARTICLE_LINK = (
),
) # Name and path can be further replaced with .format()
SECTION_LINK = (
    (  # SPIP style section links, e.g. <rub12> or <rubrique12|...>
        # The empty first group keeps the section id in group 2,
        # the same group number as in the Markdown style pattern below
        compile(r"<()(?:rub|rubrique)([0-9]+)(?:\|(.*?))?>", S | I),
        r"[{}]({})",
    ),
    (  # Markdown style internal links, e.g. [text](rub12)
        compile(r"\[(.*?)\]\((?:rub|rubrique)([0-9]+)(?:\|(.*?))?\)", S | I),
        # NOTE(review): "\1" is only expanded by regex substitution (re.sub or
        # Match.expand); a consumer that applies str.format() + str.replace()
        # would emit it literally - confirm how this template is consumed
        r"[\1{}]({})",
    ),
)  # Name and path can be further replaced with .format()
# Multi language block, to be further processed per lang
MULTILANG_BLOCK = compile(r"<multi>(.+?)<\/multi>", S | I)
MULTILANGS = compile(

View File

@ -1,12 +1,12 @@
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
import logging
from os import makedirs, remove
from os import makedirs
from os.path import basename, splitext
from re import finditer, search
from shutil import copyfile
from typing import Any, Match, Optional
from peewee import BigAutoField, DateTimeField, ModelSelect
from peewee import BigAutoField, DateTimeField, DoesNotExist, ModelSelect
from slugify import slugify
from yaml import dump
@ -27,6 +27,7 @@ from spip2md.regexmap import (
ISO_UTF,
MULTILANG_BLOCK,
MULTILANGS,
SECTION_LINK,
SPECIAL_OUTPUT,
SPIP_MARKDOWN,
UNKNOWN_ISO,
@ -34,12 +35,6 @@ from spip2md.regexmap import (
)
from spip2md.style import BLUE, BOLD, GREEN, WARNING_STYLE, YELLOW, esc
# Clear the previous log file if needed
if CFG.clear_log:
remove(CFG.logfile)
# Output logs to logfile
logging.basicConfig(filename=CFG.logfile, encoding="utf-8")
class SpipWritable:
term_color: int
@ -63,10 +58,10 @@ class SpipWritable:
# Outputs the first lang associated text
first_lang = lang.group(2)
else:
pass
title: str = first_lang[:40].strip(" \n")
translate: str = lang.group(2)[:40].strip(" \n")
logging.warning(
f"Ignored {lang.group(1)} translation of {first_lang[:40]}: "
+ lang.group(2)[:40],
f"Ignored {lang.group(1)} translation of {title}: {translate}",
)
return first_lang
@ -215,38 +210,44 @@ class SpipObject(SpipWritable):
extra: str
def convert(self, text: Optional[str], clean_html: bool = True) -> str:
def found_replace(path_link: str, doc: Any, text: str, match: Match) -> str:
    """Return *text* with the matched link replaced by the rendered template.

    *path_link* is a two-slot ``str.format`` template filled with the target
    object's ``titre`` and the value of its ``filename()`` method. Every
    occurrence of the matched link text in *text* is replaced.
    """
    matched_link = match.group()
    rendered = path_link.format(doc.titre, doc.filename())
    # Trace each successful translation in the log
    logging.info(f"Translating link to {rendered}")
    return text.replace(matched_link, rendered)
def not_found_warn(path_link: str, text: str, match: Match) -> str:
    """Log a link whose target was not found and insert a placeholder.

    The matched link text is replaced everywhere in *text* by *path_link*
    rendered with an empty name and the path "NOT FOUND". ``self`` is read
    from the enclosing method's scope to name the object containing the link.
    """
    # logging.warn() is a deprecated alias that raises a DeprecationWarning;
    # logging.warning() is the supported spelling and logs the same record
    logging.warning(f"No object for link {match.group()} in {self.titre}")
    return text.replace(match.group(), path_link.format("", "NOT FOUND"))
if text is not None and len(text) > 0:
for id_link, path_link in DOCUMENT_LINK:
for match in id_link.finditer(text):
doc: Document = Document.get(Document.id_document == match.group(2))
if doc is not None:
text = text.replace(
match.group(), path_link.format(doc.titre, doc.filename())
)
else:
logging.warn(
f"No document for link {match.group()} in {self.titre}"
)
text = text.replace(
match.group(), path_link.format("", "NOT FOUND")
logging.info(f"Found document link {match.group()} in {self.titre}")
try:
doc: Document = Document.get(
Document.id_document == match.group(2)
)
text = found_replace(path_link, doc, text, match)
except DoesNotExist:
text = not_found_warn(path_link, text, match)
for id_link, path_link in ARTICLE_LINK:
for match in id_link.finditer(text):
art: Article = Article.get(Article.id_article == match.group(2))
if art is not None:
text = text.replace(
match.group(),
path_link.format(
art.titre, f"{art.dir_slug()}/{art.filename()}"
),
)
else:
logging.warn(
f"No article for link {match.group()} in {self.titre}"
)
text = text.replace(
match.group(), path_link.format("", "NOT FOUND")
logging.info(f"Found article link {match.group()} in {self.titre}")
try:
art: Article = Article.get(Article.id_article == match.group(2))
text = found_replace(path_link, art, text, match)
except DoesNotExist:
text = not_found_warn(path_link, text, match)
for id_link, path_link in SECTION_LINK:
for match in id_link.finditer(text):
logging.info(f"Found section link {match.group()} in {self.titre}")
try:
section: Rubrique = Rubrique.get(
Rubrique.id_rubrique == match.group(2)
)
text = found_replace(path_link, section, text, match)
except DoesNotExist:
text = not_found_warn(path_link, text, match)
else:
return ""
return super().convert(text, clean_html)
@ -480,9 +481,7 @@ class RootRubrique(Rubrique):
# self.object_id = 0
self.profondeur = 0
def write_tree(
self, parent_dir: str, sections_limit: int = 0, articles_limit: int = 0
) -> list[str | list]:
def write_tree(self, parent_dir: str) -> list[str | list]:
# Define the output list to display
output: list[str | list] = []
# Print starting message