improved architecture, started handling <multi> blocks
This commit is contained in:
parent
32738a9269
commit
952595b34c
@ -9,7 +9,7 @@ from peewee import ModelSelect
|
|||||||
|
|
||||||
from spip2md.config import CFG
|
from spip2md.config import CFG
|
||||||
from spip2md.database import DB
|
from spip2md.database import DB
|
||||||
from spip2md.spipobjects import Rubrique
|
from spip2md.spipobjects import RootRubrique, Rubrique
|
||||||
|
|
||||||
# Define styles
|
# Define styles
|
||||||
BOLD = 1 # Bold
|
BOLD = 1 # Bold
|
||||||
@ -56,6 +56,7 @@ def root_sections(limit: int = 10**3) -> ModelSelect:
|
|||||||
.limit(limit)
|
.limit(limit)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
r"""
|
r"""
|
||||||
# Print the detected unknown chars in article in their context but highlighted
|
# Print the detected unknown chars in article in their context but highlighted
|
||||||
def warn_unknown_chars(article: Article) -> None:
|
def warn_unknown_chars(article: Article) -> None:
|
||||||
@ -80,12 +81,22 @@ def warn_unknown_chars(article: Article) -> None:
|
|||||||
|
|
||||||
# Print one root section list output correctly
|
# Print one root section list output correctly
|
||||||
# sys.setrecursionlimit(2000)
|
# sys.setrecursionlimit(2000)
|
||||||
def print_output(tree: list[Any], depth: int = 0, indent: str = " ") -> None:
|
def print_output(
|
||||||
|
tree: list[Any],
|
||||||
|
indent: str = " ",
|
||||||
|
depth: int = 0,
|
||||||
|
branches: int = 1,
|
||||||
|
leaves: int = 0,
|
||||||
|
) -> tuple[int, int]:
|
||||||
for sub in tree:
|
for sub in tree:
|
||||||
if type(sub) == list:
|
if type(sub) == list:
|
||||||
print_output(sub, depth + 1)
|
branches, leaves = print_output(
|
||||||
|
sub, indent, depth + 1, branches + 1, leaves
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
|
leaves += 1
|
||||||
print(indent * depth + sub)
|
print(indent * depth + sub)
|
||||||
|
return (branches, leaves)
|
||||||
|
|
||||||
|
|
||||||
# Connect to the MySQL database with Peewee ORM
|
# Connect to the MySQL database with Peewee ORM
|
||||||
@ -95,32 +106,28 @@ DB.connect()
|
|||||||
|
|
||||||
# Main loop to execute only if script is directly executed
|
# Main loop to execute only if script is directly executed
|
||||||
def main(*argv):
|
def main(*argv):
|
||||||
|
# Allow main to get args when directly executed
|
||||||
if len(argv) == 0:
|
if len(argv) == 0:
|
||||||
argv = sys.argv
|
argv = sys.argv
|
||||||
# Define max nb of sections to export based on first CLI argument TODO
|
|
||||||
if len(argv) >= 2:
|
# TODO Define max nb of sections to export based on first CLI argument
|
||||||
sections_export = int(argv[1])
|
# if len(argv) >= 2:
|
||||||
else:
|
# sections_export = int(argv[1])
|
||||||
sections_export = CFG.max_sections_export
|
|
||||||
# Define max nb of articles to export based on second CLI argument TODO
|
|
||||||
# if len(argv) >= 3:
|
|
||||||
# articles_export = int(argv[2])
|
|
||||||
# else:
|
# else:
|
||||||
# articles_export = CFG.max_articles_export
|
# sections_export = CFG.max_sections_export
|
||||||
|
|
||||||
# Clear the output dir & create a new one
|
# Clear the output dir & create a new one
|
||||||
if CFG.clear_output:
|
if CFG.clear_output:
|
||||||
rmtree(CFG.output_dir, True)
|
rmtree(CFG.output_dir, True)
|
||||||
makedirs(CFG.output_dir, exist_ok=True)
|
makedirs(CFG.output_dir, exist_ok=True)
|
||||||
|
|
||||||
# Get the first max_sections_export root sections
|
# Get the virtual id=0 section
|
||||||
sections: ModelSelect = root_sections(sections_export)
|
root: Rubrique = RootRubrique()
|
||||||
total: int = len(sections)
|
|
||||||
|
|
||||||
# Write each root sections with its subtree
|
# Write everything & print the output human-readably
|
||||||
for i, section in enumerate(sections):
|
sections, articles = print_output(root.write_tree(CFG.output_dir))
|
||||||
print_output(section.write_tree(CFG.output_dir, i, total))
|
# End, summary message
|
||||||
print() # Break line after exporting the section
|
print(f"Exported a total of {sections} sections, containing {articles} articles")
|
||||||
|
|
||||||
# print() # Break line between export & unknown characters warning
|
# print() # Break line between export & unknown characters warning
|
||||||
# Warn about each article that contains unknown character(s)
|
# Warn about each article that contains unknown character(s)
|
||||||
|
@ -112,11 +112,9 @@ SPIP_MARKDOWN = (
|
|||||||
DOCUMENT_LINK = r"(!)?\[(.*?)\]\(({})\)"
|
DOCUMENT_LINK = r"(!)?\[(.*?)\]\(({})\)"
|
||||||
DOCUMENT_LINK_REPL = r"\1[\2{}]({})"
|
DOCUMENT_LINK_REPL = r"\1[\2{}]({})"
|
||||||
|
|
||||||
# Multi language block, capture groups: (lang, text, lang, text, …)
|
# Multi language block, to be further processed per lang
|
||||||
MULTILANG = compile(
|
MULTILANG_BLOCK = compile(r"<multi>(.+?)<\/multi>", S | I)
|
||||||
r"<multi>(?:\s*\[(.{2,6})\]\s*(.*?)\s*)+<\/multi>",
|
MULTILANGS = compile(r"\[([a-zA-Z\-]{2,6})\]\s*(.+?)(?=\[[a-zA-Z\-]{2,6}\]|$)", S | I)
|
||||||
S | I,
|
|
||||||
)
|
|
||||||
|
|
||||||
# WARNING probably useless text in metadata fields, to be removed
|
# WARNING probably useless text in metadata fields, to be removed
|
||||||
BLOAT = (
|
BLOAT = (
|
||||||
|
@ -3,7 +3,7 @@ from os import makedirs
|
|||||||
from os.path import basename, splitext
|
from os.path import basename, splitext
|
||||||
from re import finditer, sub
|
from re import finditer, sub
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
from typing import Any, Optional
|
from typing import Any, Match, Optional
|
||||||
|
|
||||||
from peewee import BigAutoField, DateTimeField, ModelSelect
|
from peewee import BigAutoField, DateTimeField, ModelSelect
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
@ -24,7 +24,8 @@ from spip2md.regexmap import (
|
|||||||
DOCUMENT_LINK_REPL,
|
DOCUMENT_LINK_REPL,
|
||||||
HTMLTAG,
|
HTMLTAG,
|
||||||
ISO_UTF,
|
ISO_UTF,
|
||||||
MULTILANG,
|
MULTILANG_BLOCK,
|
||||||
|
MULTILANGS,
|
||||||
SPIP_MARKDOWN,
|
SPIP_MARKDOWN,
|
||||||
UNKNOWN_ISO,
|
UNKNOWN_ISO,
|
||||||
)
|
)
|
||||||
@ -35,8 +36,47 @@ class SpipWritable:
|
|||||||
texte: str
|
texte: str
|
||||||
lang: str
|
lang: str
|
||||||
titre: str
|
titre: str
|
||||||
|
descriptif: str
|
||||||
profondeur: int
|
profondeur: int
|
||||||
|
|
||||||
|
# Returns the first detected language (& instantiates a new object for the second)
|
||||||
|
# (currently doesn’t instantiate, just warns)
|
||||||
|
def translate(self, text: str) -> str:
|
||||||
|
def replace_lang(match: Match[str]) -> str:
|
||||||
|
first_lang: str = match.group(1)
|
||||||
|
# The first group is the inside of <multi></multi> blocks
|
||||||
|
for i, lang in enumerate(MULTILANGS.finditer(match.group(1))):
|
||||||
|
if i == 0:
|
||||||
|
# Redefine this lang to the first one WARNING
|
||||||
|
self.lang = lang.group(1)
|
||||||
|
# Outputs the first lang associated text
|
||||||
|
first_lang = lang.group(2)
|
||||||
|
else:
|
||||||
|
pass
|
||||||
|
# print("Found other language for", first_lang, ":", lang.groups())
|
||||||
|
return first_lang
|
||||||
|
|
||||||
|
return MULTILANG_BLOCK.sub(replace_lang, text)
|
||||||
|
|
||||||
|
# Apply different mappings to a text field, like SPIP to Markdown or encoding
|
||||||
|
def convert(self, text: Optional[str]) -> str:
|
||||||
|
if text is not None and len(text) > 0:
|
||||||
|
for spip, markdown in SPIP_MARKDOWN:
|
||||||
|
text = spip.sub(markdown, text)
|
||||||
|
for bloat in BLOAT:
|
||||||
|
text = bloat.sub("", text)
|
||||||
|
for iso, utf in ISO_UTF:
|
||||||
|
text = text.replace(iso, utf)
|
||||||
|
text = self.translate(text)
|
||||||
|
else:
|
||||||
|
return ""
|
||||||
|
return text
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
self.titre = self.convert(self.titre)
|
||||||
|
self.descriptif = self.convert(self.descriptif)
|
||||||
|
|
||||||
def filename(self, date: bool = False) -> str:
|
def filename(self, date: bool = False) -> str:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
f"Subclasses need to implement filename(), date: {date}"
|
f"Subclasses need to implement filename(), date: {date}"
|
||||||
@ -59,19 +99,6 @@ class SpipWritable:
|
|||||||
output[-1] += "MISSING NAME"
|
output[-1] += "MISSING NAME"
|
||||||
return output
|
return output
|
||||||
|
|
||||||
# Apply different mappings to text fields, like SPIP to Markdown or encoding
|
|
||||||
def convert_attrs(self, *attrs: str) -> None:
|
|
||||||
attrs += "titre", "descriptif"
|
|
||||||
for attr in attrs:
|
|
||||||
a = getattr(self, attr)
|
|
||||||
if len(a) > 0:
|
|
||||||
for spip, markdown in SPIP_MARKDOWN:
|
|
||||||
setattr(self, attr, spip.sub(markdown, a))
|
|
||||||
for bloat in BLOAT:
|
|
||||||
setattr(self, attr, bloat.sub("", a))
|
|
||||||
for iso, utf in ISO_UTF:
|
|
||||||
setattr(self, attr, a.replace(iso, utf))
|
|
||||||
|
|
||||||
# Write object to output destination
|
# Write object to output destination
|
||||||
def write(self, parent_dir: str) -> str:
|
def write(self, parent_dir: str) -> str:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
@ -107,8 +134,6 @@ class Document(SpipWritable, SpipDocuments):
|
|||||||
|
|
||||||
# Write document to output destination
|
# Write document to output destination
|
||||||
def write(self, parent_dir: str) -> str:
|
def write(self, parent_dir: str) -> str:
|
||||||
# Apply needed conversions
|
|
||||||
super().convert_attrs()
|
|
||||||
# Define file source and destination
|
# Define file source and destination
|
||||||
src: str = CFG.data_dir + self.fichier
|
src: str = CFG.data_dir + self.fichier
|
||||||
dest: str = parent_dir + self.filename()
|
dest: str = parent_dir + self.filename()
|
||||||
@ -129,6 +154,7 @@ class SpipObject(SpipWritable):
|
|||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
# Common fields that need conversions
|
# Common fields that need conversions
|
||||||
|
self.extra: str = self.convert(self.extra)
|
||||||
self.statut: str = "false" if self.statut == "publie" else "true"
|
self.statut: str = "false" if self.statut == "publie" else "true"
|
||||||
self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
|
self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
|
||||||
# Define file prefix (needs to be redefined for sections)
|
# Define file prefix (needs to be redefined for sections)
|
||||||
@ -223,13 +249,21 @@ class SpipObject(SpipWritable):
|
|||||||
body += "\n\n# EXTRA\n\n" + self.extra
|
body += "\n\n# EXTRA\n\n" + self.extra
|
||||||
return body
|
return body
|
||||||
|
|
||||||
def convert_attrs(self, *attrs: str) -> None:
|
# Clean remaining HTML tags in attrs
|
||||||
return super().convert_attrs(*attrs, "descriptif", "extra")
|
def clean_html(self, *attrs: str) -> None:
|
||||||
|
attrs += "titre", "texte", "descriptif", "extra"
|
||||||
|
for attr in attrs:
|
||||||
|
a = getattr(self, attr)
|
||||||
|
if len(a) > 0:
|
||||||
|
setattr(self, attr, HTMLTAG.sub("", a))
|
||||||
|
|
||||||
# Write object to output destination
|
# Write object to output destination
|
||||||
def write(self, parent_dir: str) -> str:
|
def write(self, parent_dir: str, clean_html: bool = True) -> str:
|
||||||
# Apply needed conversions
|
# Link articles
|
||||||
super().convert_attrs()
|
self.link_articles()
|
||||||
|
# Delete remaining HTML tags WARNING
|
||||||
|
if clean_html:
|
||||||
|
self.clean_html()
|
||||||
# Define actual export directory
|
# Define actual export directory
|
||||||
directory: str = parent_dir + self.dir_slug()
|
directory: str = parent_dir + self.dir_slug()
|
||||||
# Make a directory for this object if there isn’t
|
# Make a directory for this object if there isn’t
|
||||||
@ -249,15 +283,14 @@ class Article(SpipObject, SpipArticles):
|
|||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
# More conversions needed for articles
|
# More conversions needed for articles
|
||||||
|
self.surtitre: str = self.convert(self.surtitre)
|
||||||
|
self.soustitre: str = self.convert(self.soustitre)
|
||||||
|
self.chapo: str = self.convert(self.chapo)
|
||||||
|
self.ps: str = self.convert(self.ps)
|
||||||
self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"
|
self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"
|
||||||
# ID
|
# ID
|
||||||
self.object_id = self.id_article
|
self.object_id = self.id_article
|
||||||
|
|
||||||
def convert_attrs(self, *attrs: str) -> None:
|
|
||||||
return super().convert_attrs(
|
|
||||||
*attrs, "surtitre", "soustitre", "chapo", "ps", "accepter_forum"
|
|
||||||
)
|
|
||||||
|
|
||||||
def frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
|
def frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
|
||||||
meta: dict[str, Any] = {
|
meta: dict[str, Any] = {
|
||||||
# Article specific
|
# Article specific
|
||||||
@ -331,7 +364,6 @@ class Rubrique(SpipObject, SpipRubriques):
|
|||||||
articles = self.articles()
|
articles = self.articles()
|
||||||
documents = self.documents()
|
documents = self.documents()
|
||||||
# Write this section
|
# Write this section
|
||||||
self.link_articles()
|
|
||||||
output[-1] += self.end_message(self.write(parent_dir))
|
output[-1] += self.end_message(self.write(parent_dir))
|
||||||
# Redefine parent_dir for subtree elements
|
# Redefine parent_dir for subtree elements
|
||||||
parent_dir = parent_dir + self.dir_slug()
|
parent_dir = parent_dir + self.dir_slug()
|
||||||
@ -363,3 +395,36 @@ class Rubrique(SpipObject, SpipRubriques):
|
|||||||
for i, s in enumerate(child_sections):
|
for i, s in enumerate(child_sections):
|
||||||
output.append(s.write_tree(parent_dir, i, nb))
|
output.append(s.write_tree(parent_dir, i, nb))
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
class RootRubrique(Rubrique):
|
||||||
|
class Meta:
|
||||||
|
table_name: str = "spip_rubriques"
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
# 0 ID
|
||||||
|
self.id_rubrique = 0
|
||||||
|
# self.object_id = 0
|
||||||
|
|
||||||
|
def write_tree(
|
||||||
|
self, parent_dir: str, sections_limit: int = 0, articles_limit: int = 0
|
||||||
|
) -> list[str | list[Any]]:
|
||||||
|
# Define the output list to display
|
||||||
|
output: list[str | list[Any]] = []
|
||||||
|
# Starting message
|
||||||
|
output.append(
|
||||||
|
f"Begin converting {CFG.db}@{CFG.db_host} db to plain Markdown+YAML files"
|
||||||
|
)
|
||||||
|
output.append(f" as db user {CFG.db_user}, into the directory {parent_dir}")
|
||||||
|
# Get all child sections of self
|
||||||
|
child_sections = (
|
||||||
|
Rubrique.select()
|
||||||
|
.where(Rubrique.id_parent == self.id_rubrique)
|
||||||
|
.order_by(Rubrique.date.desc())
|
||||||
|
)
|
||||||
|
nb: int = len(child_sections)
|
||||||
|
# Do the same for subsections (write their entire subtree)
|
||||||
|
for i, s in enumerate(child_sections):
|
||||||
|
output.append(s.write_tree(parent_dir, i, nb))
|
||||||
|
return output
|
||||||
|
Loading…
Reference in New Issue
Block a user