improved architecture, started handling <multi> blocks

This commit is contained in:
Guilhem Fauré 2023-05-26 16:39:48 +02:00
parent 32738a9269
commit 952595b34c
3 changed files with 122 additions and 52 deletions

View File

@ -9,7 +9,7 @@ from peewee import ModelSelect
from spip2md.config import CFG from spip2md.config import CFG
from spip2md.database import DB from spip2md.database import DB
from spip2md.spipobjects import Rubrique from spip2md.spipobjects import RootRubrique, Rubrique
# Define styles # Define styles
BOLD = 1 # Bold BOLD = 1 # Bold
@ -56,6 +56,7 @@ def root_sections(limit: int = 10**3) -> ModelSelect:
.limit(limit) .limit(limit)
) )
r""" r"""
# Print the detected unknown chars in article in their context but highlighted # Print the detected unknown chars in article in their context but highlighted
def warn_unknown_chars(article: Article) -> None: def warn_unknown_chars(article: Article) -> None:
@ -80,12 +81,22 @@ def warn_unknown_chars(article: Article) -> None:
# Print one root section list output correctly # Print one root section list output correctly
# sys.setrecursionlimit(2000) # sys.setrecursionlimit(2000)
def print_output(
    tree: list[Any],
    indent: str = " ",
    depth: int = 0,
    branches: int = 1,
    leaves: int = 0,
) -> tuple[int, int]:
    """Print a nested output tree and count what it contains.

    Every non-list element of ``tree`` is printed on its own line, prefixed
    by ``indent`` repeated ``depth`` times. Every nested list is printed
    recursively, one level deeper.

    Args:
        tree: Nested list whose leaves are printable strings.
        indent: String repeated once per depth level before each leaf.
        depth: Nesting level of ``tree`` (0 for the outermost call).
        branches: Number of lists counted so far, ``tree`` itself included.
        leaves: Number of non-list elements counted so far.

    Returns:
        The updated ``(branches, leaves)`` counters.
    """
    for sub in tree:
        # isinstance (rather than an exact type comparison) so that list
        # subclasses are also walked instead of being concatenated to a str
        if isinstance(sub, list):
            # The counters are threaded through the recursion so that the
            # totals accumulate across the whole tree
            branches, leaves = print_output(
                sub, indent, depth + 1, branches + 1, leaves
            )
        else:
            leaves += 1
            print(indent * depth + sub)
    return (branches, leaves)
# Connect to the MySQL database with Peewee ORM # Connect to the MySQL database with Peewee ORM
@ -95,32 +106,28 @@ DB.connect()
# Main loop to execute only if script is directly executed # Main loop to execute only if script is directly executed
def main(*argv): def main(*argv):
# Allow main to get args when directly executed
if len(argv) == 0: if len(argv) == 0:
argv = sys.argv argv = sys.argv
# Define max nb of sections to export based on first CLI argument TODO
if len(argv) >= 2: # TODO Define max nb of sections to export based on first CLI argument
sections_export = int(argv[1]) # if len(argv) >= 2:
else: # sections_export = int(argv[1])
sections_export = CFG.max_sections_export
# Define max nb of articles to export based on second CLI argument TODO
# if len(argv) >= 3:
# articles_export = int(argv[2])
# else: # else:
# articles_export = CFG.max_articles_export # sections_export = CFG.max_sections_export
# Clear the output dir & create a new # Clear the output dir & create a new
if CFG.clear_output: if CFG.clear_output:
rmtree(CFG.output_dir, True) rmtree(CFG.output_dir, True)
makedirs(CFG.output_dir, exist_ok=True) makedirs(CFG.output_dir, exist_ok=True)
# Get the first max_sections_export root sections # Get the virtual id=0 section
sections: ModelSelect = root_sections(sections_export) root: Rubrique = RootRubrique()
total: int = len(sections)
# Write each root sections with its subtree # Write everything & print the output human-readably
for i, section in enumerate(sections): sections, articles = print_output(root.write_tree(CFG.output_dir))
print_output(section.write_tree(CFG.output_dir, i, total)) # End, summary message
print() # Break line after exporting the section print(f"Exported a total of {sections} sections, containing {articles} articles")
# print() # Break line between export & unknown characters warning # print() # Break line between export & unknown characters warning
# Warn about each article that contains unknown(s) character(s) # Warn about each article that contains unknown(s) character(s)

View File

@ -112,11 +112,9 @@ SPIP_MARKDOWN = (
DOCUMENT_LINK = r"(!)?\[(.*?)\]\(({})\)" DOCUMENT_LINK = r"(!)?\[(.*?)\]\(({})\)"
DOCUMENT_LINK_REPL = r"\1[\2{}]({})" DOCUMENT_LINK_REPL = r"\1[\2{}]({})"
# Multi language block, to be further processed per lang
# Capture group: (everything between <multi> and </multi>)
MULTILANG_BLOCK = compile(r"<multi>(.+?)<\/multi>", S | I)
# One "[lang] text" entry of a multi language block, capture groups: (lang, text)
# The text stops before the next "[lang]" tag or at the end of the block, and
# the whitespace surrounding it is left out of the capture group
MULTILANGS = compile(
    r"\[([a-zA-Z\-]{2,6})\]\s*(.+?)\s*(?=\[[a-zA-Z\-]{2,6}\]|$)",
    S | I,
)
# WARNING probably useless text in metadata fields, to be removed # WARNING probably useless text in metadata fields, to be removed
BLOAT = ( BLOAT = (

View File

@ -3,7 +3,7 @@ from os import makedirs
from os.path import basename, splitext from os.path import basename, splitext
from re import finditer, sub from re import finditer, sub
from shutil import copyfile from shutil import copyfile
from typing import Any, Optional from typing import Any, Match, Optional
from peewee import BigAutoField, DateTimeField, ModelSelect from peewee import BigAutoField, DateTimeField, ModelSelect
from slugify import slugify from slugify import slugify
@ -24,7 +24,8 @@ from spip2md.regexmap import (
DOCUMENT_LINK_REPL, DOCUMENT_LINK_REPL,
HTMLTAG, HTMLTAG,
ISO_UTF, ISO_UTF,
MULTILANG, MULTILANG_BLOCK,
MULTILANGS,
SPIP_MARKDOWN, SPIP_MARKDOWN,
UNKNOWN_ISO, UNKNOWN_ISO,
) )
@ -35,8 +36,47 @@ class SpipWritable:
texte: str texte: str
lang: str lang: str
titre: str titre: str
descriptif: str
profondeur: int profondeur: int
# Returns the first detected language (& instantiate a new object for the second)
# (currently dont instantiate, just warns)
def translate(self, text: str) -> str:
    """Replace each <multi> block of ``text`` by its first language version.

    Side effect: ``self.lang`` is overwritten with the language code of the
    first entry of every block that is processed (WARNING). The other
    languages of a block are currently dropped.

    Args:
        text: Text that may contain ``<multi>…</multi>`` blocks.

    Returns:
        ``text`` with every block replaced by the text of its first
        language, or by its raw content when no ``[lang]`` entry is found.
    """

    def replace_lang(match: Match[str]) -> str:
        # The first group is the inside of <multi></multi> blocks
        block: str = match.group(1)
        # Only the first language is used, so a single search is enough
        first = MULTILANGS.search(block)
        if first is None:
            # No "[lang] text" entry detected: keep the block content as is
            return block
        # Redefine this lang to the first one WARNING
        self.lang = first.group(1)
        # Outputs the first lang associated text
        return first.group(2)

    return MULTILANG_BLOCK.sub(replace_lang, text)
# Apply different mappings to a text field, like SPIP to Markdown or encoding
def convert(self, text: Optional[str]) -> str:
    """Apply every text mapping to a field and return the converted text.

    The mappings are applied in this order: SPIP_MARKDOWN substitutions,
    BLOAT removal, ISO_UTF replacements, then ``self.translate``.

    Args:
        text: Raw field value, possibly ``None`` or empty.

    Returns:
        The converted text, or ``""`` when ``text`` is ``None`` or empty.
    """
    # Nothing to convert for a missing or empty field
    if not text:
        return ""
    for spip, markdown in SPIP_MARKDOWN:
        text = spip.sub(markdown, text)
    for bloat in BLOAT:
        text = bloat.sub("", text)
    for iso, utf in ISO_UTF:
        text = text.replace(iso, utf)
    return self.translate(text)
def __init__(self, *args, **kwargs):
    """Initialise the parent class, then convert the common text fields."""
    super().__init__(*args, **kwargs)
    # Fields shared by every writable object, converted in this order
    for field in ("titre", "descriptif"):
        setattr(self, field, self.convert(getattr(self, field)))
def filename(self, date: bool = False) -> str: def filename(self, date: bool = False) -> str:
raise NotImplementedError( raise NotImplementedError(
f"Subclasses need to implement filename(), date: {date}" f"Subclasses need to implement filename(), date: {date}"
@ -59,19 +99,6 @@ class SpipWritable:
output[-1] += "MISSING NAME" output[-1] += "MISSING NAME"
return output return output
# Apply different mappings to text fields, like SPIP to Markdown or encoding
def convert_attrs(self, *attrs: str) -> None:
attrs += "titre", "descriptif"
for attr in attrs:
a = getattr(self, attr)
if len(a) > 0:
for spip, markdown in SPIP_MARKDOWN:
setattr(self, attr, spip.sub(markdown, a))
for bloat in BLOAT:
setattr(self, attr, bloat.sub("", a))
for iso, utf in ISO_UTF:
setattr(self, attr, a.replace(iso, utf))
# Write object to output destination # Write object to output destination
def write(self, parent_dir: str) -> str: def write(self, parent_dir: str) -> str:
raise NotImplementedError( raise NotImplementedError(
@ -107,8 +134,6 @@ class Document(SpipWritable, SpipDocuments):
# Write document to output destination # Write document to output destination
def write(self, parent_dir: str) -> str: def write(self, parent_dir: str) -> str:
# Apply needed conversions
super().convert_attrs()
# Define file source and destination # Define file source and destination
src: str = CFG.data_dir + self.fichier src: str = CFG.data_dir + self.fichier
dest: str = parent_dir + self.filename() dest: str = parent_dir + self.filename()
@ -129,6 +154,7 @@ class SpipObject(SpipWritable):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
# Common fields that need conversions # Common fields that need conversions
self.extra: str = self.convert(self.extra)
self.statut: str = "false" if self.statut == "publie" else "true" self.statut: str = "false" if self.statut == "publie" else "true"
self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true" self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
# Define file prefix (needs to be redefined for sections) # Define file prefix (needs to be redefined for sections)
@ -223,13 +249,21 @@ class SpipObject(SpipWritable):
body += "\n\n# EXTRA\n\n" + self.extra body += "\n\n# EXTRA\n\n" + self.extra
return body return body
# Clean remaining HTML tags in attrs
def clean_html(self, *attrs: str) -> None:
    """Strip what HTMLTAG matches from the given attributes, in place.

    "titre", "texte", "descriptif" and "extra" are always processed, after
    the attribute names passed in ``attrs``.

    Args:
        *attrs: Names of additional attributes of ``self`` to clean.
    """
    for attr in (*attrs, "titre", "texte", "descriptif", "extra"):
        value = getattr(self, attr)
        # Skip empty values, and also None ones on which len() would raise
        if value:
            setattr(self, attr, HTMLTAG.sub("", value))
# Write object to output destination # Write object to output destination
def write(self, parent_dir: str) -> str: def write(self, parent_dir: str, clean_html: bool = True) -> str:
# Apply needed conversions # Link articles
super().convert_attrs() self.link_articles()
# Delete remaining HTML tags WARNING
if clean_html:
self.clean_html()
# Define actual export directory # Define actual export directory
directory: str = parent_dir + self.dir_slug() directory: str = parent_dir + self.dir_slug()
# Make a directory for this object if there isnt # Make a directory for this object if there isnt
@ -249,15 +283,14 @@ class Article(SpipObject, SpipArticles):
def __init__(self, *args, **kwargs):
    """Initialise the parent class, then convert the article-only fields."""
    super().__init__(*args, **kwargs)
    # More conversions needed for articles, applied in this order
    for field in ("surtitre", "soustitre", "chapo", "ps"):
        setattr(self, field, self.convert(getattr(self, field)))
    # "oui" becomes "true", anything else becomes "false"
    forum_allowed: bool = self.accepter_forum == "oui"
    self.accepter_forum: str = "true" if forum_allowed else "false"
    # ID
    self.object_id = self.id_article
def convert_attrs(self, *attrs: str) -> None:
return super().convert_attrs(
*attrs, "surtitre", "soustitre", "chapo", "ps", "accepter_forum"
)
def frontmatter(self, append: Optional[dict[str, Any]] = None) -> str: def frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
meta: dict[str, Any] = { meta: dict[str, Any] = {
# Article specific # Article specific
@ -331,7 +364,6 @@ class Rubrique(SpipObject, SpipRubriques):
articles = self.articles() articles = self.articles()
documents = self.documents() documents = self.documents()
# Write this section # Write this section
self.link_articles()
output[-1] += self.end_message(self.write(parent_dir)) output[-1] += self.end_message(self.write(parent_dir))
# Redefine parent_dir for subtree elements # Redefine parent_dir for subtree elements
parent_dir = parent_dir + self.dir_slug() parent_dir = parent_dir + self.dir_slug()
@ -363,3 +395,36 @@ class Rubrique(SpipObject, SpipRubriques):
for i, s in enumerate(child_sections): for i, s in enumerate(child_sections):
output.append(s.write_tree(parent_dir, i, nb)) output.append(s.write_tree(parent_dir, i, nb))
return output return output
class RootRubrique(Rubrique):
    """Section whose id is forced to 0, used as the parent of the top sections."""

    class Meta:
        table_name: str = "spip_rubriques"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 0 ID
        self.id_rubrique = 0
        # self.object_id = 0

    def write_tree(
        self, parent_dir: str, sections_limit: int = 0, articles_limit: int = 0
    ) -> list[str | list[Any]]:
        """Write the subtree of every child section and return the output tree.

        ``sections_limit`` and ``articles_limit`` are accepted but not used
        by this implementation.
        """
        # Output to display, beginning with the starting message
        output: list[str | list[Any]] = [
            f"Begin converting {CFG.db}@{CFG.db_host} db to plain Markdown+YAML files",
            f" as db user {CFG.db_user}, into the directory {parent_dir}",
        ]
        # Sections whose parent is this one, ordered by descending date
        children = (
            Rubrique.select()
            .where(Rubrique.id_parent == self.id_rubrique)
            .order_by(Rubrique.date.desc())
        )
        total: int = len(children)
        # Each child writes its entire subtree and contributes one sub-list
        output.extend(
            child.write_tree(parent_dir, index, total)
            for index, child in enumerate(children)
        )
        return output