improved architecture, started handling <multi> blocks

This commit is contained in:
Guilhem Fauré 2023-05-26 16:39:48 +02:00
parent 32738a9269
commit 952595b34c
3 changed files with 122 additions and 52 deletions

View File

@ -9,7 +9,7 @@ from peewee import ModelSelect
from spip2md.config import CFG
from spip2md.database import DB
from spip2md.spipobjects import Rubrique
from spip2md.spipobjects import RootRubrique, Rubrique
# Define styles
BOLD = 1 # Bold
@ -56,6 +56,7 @@ def root_sections(limit: int = 10**3) -> ModelSelect:
.limit(limit)
)
r"""
# Print the detected unknown chars in article in their context but highlighted
def warn_unknown_chars(article: Article) -> None:
@ -80,12 +81,22 @@ def warn_unknown_chars(article: Article) -> None:
# Print one root section list output correctly
# sys.setrecursionlimit(2000)
def print_output(
    tree: list[Any],
    indent: str = " ",
    depth: int = 0,
    branches: int = 1,
    leaves: int = 0,
) -> tuple[int, int]:
    """Print a nested report and count what it contains.

    Every non-list item of ``tree`` is printed on its own line, prefixed by
    ``indent`` repeated ``depth`` times. Every nested list is printed
    recursively, one level deeper.

    Args:
        tree: Report to print; items are either lines (str) or nested lists.
        indent: String repeated once per depth level in front of each line.
        depth: Nesting level of ``tree`` (0 for the outermost list).
        branches: Running count of lists seen so far. It starts at 1, so the
            outermost list itself is counted.
        leaves: Running count of lines printed so far.

    Returns:
        The ``(branches, leaves)`` counters after walking the whole of ``tree``.
    """
    for sub in tree:
        # isinstance rather than an exact type comparison, so that list
        # subclasses are walked as branches instead of being concatenated
        # to a string (which would raise TypeError)
        if isinstance(sub, list):
            branches, leaves = print_output(
                sub, indent, depth + 1, branches + 1, leaves
            )
        else:
            leaves += 1
            print(indent * depth + sub)
    return (branches, leaves)
# Connect to the MySQL database with Peewee ORM
@ -95,32 +106,28 @@ DB.connect()
# Main loop to execute only if script is directly executed
def main(*argv):
# Allow main to get args when directly executed
if len(argv) == 0:
argv = sys.argv
# Define max nb of sections to export based on first CLI argument TODO
if len(argv) >= 2:
sections_export = int(argv[1])
else:
sections_export = CFG.max_sections_export
# Define max nb of articles to export based on second CLI argument TODO
# if len(argv) >= 3:
# articles_export = int(argv[2])
# TODO Define max nb of sections to export based on first CLI argument
# if len(argv) >= 2:
# sections_export = int(argv[1])
# else:
# articles_export = CFG.max_articles_export
# sections_export = CFG.max_sections_export
# Clear the output dir & create a new
if CFG.clear_output:
rmtree(CFG.output_dir, True)
makedirs(CFG.output_dir, exist_ok=True)
# Get the first max_sections_export root sections
sections: ModelSelect = root_sections(sections_export)
total: int = len(sections)
# Get the virtual id=0 section
root: Rubrique = RootRubrique()
# Write each root sections with its subtree
for i, section in enumerate(sections):
print_output(section.write_tree(CFG.output_dir, i, total))
print() # Break line after exporting the section
# Write everything & print the output human-readably
sections, articles = print_output(root.write_tree(CFG.output_dir))
# End, summary message
print(f"Exported a total of {sections} sections, containing {articles} articles")
# print() # Break line between export & unknown characters warning
# Warn about each article that contains unknown(s) character(s)

View File

@ -112,11 +112,9 @@ SPIP_MARKDOWN = (
DOCUMENT_LINK = r"(!)?\[(.*?)\]\(({})\)"
DOCUMENT_LINK_REPL = r"\1[\2{}]({})"
# Multi language block, capture groups: (lang, text, lang, text, …)
MULTILANG = compile(
r"<multi>(?:\s*\[(.{2,6})\]\s*(.*?)\s*)+<\/multi>",
S | I,
)
# Multi language block, to be further processed per lang
MULTILANG_BLOCK = compile(r"<multi>(.+?)<\/multi>", S | I)
MULTILANGS = compile(r"\[([a-zA-Z\-]{2,6})\]\s*(.+?)(?=\[[a-zA-Z\-]{2,6}\]|$)", S | I)
# WARNING probably useless text in metadata fields, to be removed
BLOAT = (

View File

@ -3,7 +3,7 @@ from os import makedirs
from os.path import basename, splitext
from re import finditer, sub
from shutil import copyfile
from typing import Any, Optional
from typing import Any, Match, Optional
from peewee import BigAutoField, DateTimeField, ModelSelect
from slugify import slugify
@ -24,7 +24,8 @@ from spip2md.regexmap import (
DOCUMENT_LINK_REPL,
HTMLTAG,
ISO_UTF,
MULTILANG,
MULTILANG_BLOCK,
MULTILANGS,
SPIP_MARKDOWN,
UNKNOWN_ISO,
)
@ -35,8 +36,47 @@ class SpipWritable:
texte: str
lang: str
titre: str
descriptif: str
profondeur: int
# Returns the text of the first detected language of each <multi> block
# (the other languages are currently ignored: no new object is instantiated yet)
def translate(self, text: str) -> str:
    """Reduce every <multi> block of ``text`` to its first language.

    Each match of MULTILANG_BLOCK is replaced by the text of the first
    language entry that MULTILANGS finds inside it. When no language entry
    is found, the inside of the block is kept unchanged.

    Side effect: ``self.lang`` is overwritten with the language code of the
    first entry of each processed block. Other languages are ignored.
    """

    def keep_first_lang(block: Match[str]) -> str:
        # Group 1 is the inside of the <multi></multi> block
        inner: str = block.group(1)
        first = MULTILANGS.search(inner)
        if first is None:
            # No language entry detected: output the block content as is
            return inner
        # Redefine this lang to the first one WARNING
        self.lang = first.group(1)
        # Output the text associated with the first lang
        return first.group(2)

    return MULTILANG_BLOCK.sub(keep_first_lang, text)
# Apply different mappings to a text field, like SPIP to Markdown or encoding
def convert(self, text: Optional[str]) -> str:
    """Apply every text mapping to ``text`` and return the result.

    The mappings run in this order: the SPIP_MARKDOWN substitutions, removal
    of each BLOAT pattern, the ISO_UTF literal replacements, then
    ``self.translate``.

    Returns:
        The converted text, or "" when ``text`` is None or empty.
    """
    # Nothing to convert
    if text is None or len(text) == 0:
        return ""
    for pattern, replacement in SPIP_MARKDOWN:
        text = pattern.sub(replacement, text)
    for useless in BLOAT:
        text = useless.sub("", text)
    for broken, fixed in ISO_UTF:
        text = text.replace(broken, fixed)
    return self.translate(text)
def __init__(self, *args, **kwargs):
    """Initialise the parent class, then convert the common text fields."""
    super().__init__(*args, **kwargs)
    # Fields shared by every writable object, converted in this order
    for field in ("titre", "descriptif"):
        setattr(self, field, self.convert(getattr(self, field)))
def filename(self, date: bool = False) -> str:
raise NotImplementedError(
f"Subclasses need to implement filename(), date: {date}"
@ -59,19 +99,6 @@ class SpipWritable:
output[-1] += "MISSING NAME"
return output
# Apply different mappings to text fields, like SPIP to Markdown or encoding
def convert_attrs(self, *attrs: str) -> None:
attrs += "titre", "descriptif"
for attr in attrs:
a = getattr(self, attr)
if len(a) > 0:
for spip, markdown in SPIP_MARKDOWN:
setattr(self, attr, spip.sub(markdown, a))
for bloat in BLOAT:
setattr(self, attr, bloat.sub("", a))
for iso, utf in ISO_UTF:
setattr(self, attr, a.replace(iso, utf))
# Write object to output destination
def write(self, parent_dir: str) -> str:
raise NotImplementedError(
@ -107,8 +134,6 @@ class Document(SpipWritable, SpipDocuments):
# Write document to output destination
def write(self, parent_dir: str) -> str:
# Apply needed conversions
super().convert_attrs()
# Define file source and destination
src: str = CFG.data_dir + self.fichier
dest: str = parent_dir + self.filename()
@ -129,6 +154,7 @@ class SpipObject(SpipWritable):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Common fields that need conversions
self.extra: str = self.convert(self.extra)
self.statut: str = "false" if self.statut == "publie" else "true"
self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
# Define file prefix (needs to be redefined for sections)
@ -223,13 +249,21 @@ class SpipObject(SpipWritable):
body += "\n\n# EXTRA\n\n" + self.extra
return body
def convert_attrs(self, *attrs: str) -> None:
return super().convert_attrs(*attrs, "descriptif", "extra")
# Clean remaining HTML tags in attrs
def clean_html(self, *attrs: str) -> None:
    """Remove every HTMLTAG match from the given attributes, in place.

    Args:
        *attrs: Names of extra attributes to clean. "titre", "texte",
            "descriptif" and "extra" are always cleaned as well.
    """
    attrs += "titre", "texte", "descriptif", "extra"
    for attr in attrs:
        value = getattr(self, attr)
        # Truthiness skips empty strings and also None, on which len()
        # would raise TypeError
        if value:
            setattr(self, attr, HTMLTAG.sub("", value))
# Write object to output destination
def write(self, parent_dir: str) -> str:
# Apply needed conversions
super().convert_attrs()
def write(self, parent_dir: str, clean_html: bool = True) -> str:
# Link articles
self.link_articles()
# Delete remaining HTML tags WARNING
if clean_html:
self.clean_html()
# Define actual export directory
directory: str = parent_dir + self.dir_slug()
# Make a directory for this object if there isnt
@ -249,15 +283,14 @@ class Article(SpipObject, SpipArticles):
def __init__(self, *args, **kwargs):
    """Initialise the parent class, then convert the article-only fields."""
    super().__init__(*args, **kwargs)
    # More conversions needed for articles, applied in this order
    for field in ("surtitre", "soustitre", "chapo", "ps"):
        setattr(self, field, self.convert(getattr(self, field)))
    # "oui" becomes the string "true", anything else becomes "false"
    forum_accepted: bool = self.accepter_forum == "oui"
    self.accepter_forum: str = "true" if forum_accepted else "false"
    # ID
    self.object_id = self.id_article
def convert_attrs(self, *attrs: str) -> None:
return super().convert_attrs(
*attrs, "surtitre", "soustitre", "chapo", "ps", "accepter_forum"
)
def frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
meta: dict[str, Any] = {
# Article specific
@ -331,7 +364,6 @@ class Rubrique(SpipObject, SpipRubriques):
articles = self.articles()
documents = self.documents()
# Write this section
self.link_articles()
output[-1] += self.end_message(self.write(parent_dir))
# Redefine parent_dir for subtree elements
parent_dir = parent_dir + self.dir_slug()
@ -363,3 +395,36 @@ class Rubrique(SpipObject, SpipRubriques):
for i, s in enumerate(child_sections):
output.append(s.write_tree(parent_dir, i, nb))
return output
class RootRubrique(Rubrique):
    """Virtual section with ID 0, used as the starting point of the export."""

    class Meta:
        table_name: str = "spip_rubriques"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The root always has the ID 0
        self.id_rubrique = 0
        # self.object_id = 0

    def write_tree(
        self, parent_dir: str, sections_limit: int = 0, articles_limit: int = 0
    ) -> list[str | list[Any]]:
        """Write every section whose parent is this root, with its subtree.

        Args:
            parent_dir: Directory passed to each child section's write_tree.
            sections_limit: Currently unused.
            articles_limit: Currently unused.

        Returns:
            The output to display: two starting messages, followed by the
            output of each child section's write_tree.
        """
        # Output to display, beginning with the starting messages
        report: list[str | list[Any]] = [
            f"Begin converting {CFG.db}@{CFG.db_host} db to plain Markdown+YAML files",
            f" as db user {CFG.db_user}, into the directory {parent_dir}",
        ]
        # Sections that are direct children of self, ordered by descending date
        children = (
            Rubrique.select()
            .where(Rubrique.id_parent == self.id_rubrique)
            .order_by(Rubrique.date.desc())
        )
        total: int = len(children)
        # Write the entire subtree of each child and collect its output
        report.extend(
            child.write_tree(parent_dir, rank, total)
            for rank, child in enumerate(children)
        )
        return report