From f06d09d3386be86f61d8c4bcde56d7e82d04bd40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guilhem=20Faur=C3=A9?= Date: Thu, 25 May 2023 13:52:00 +0200 Subject: [PATCH] start refactor to make section tree exporting recursive to be able to handle the whole depth of SPIP section tree + code deduplication --- spip2md/config.py | 20 +++-- spip2md/main.py | 181 ++++----------------------------------- spip2md/spipobjects.py | 189 +++++++++++++++++++++++++++-------------- spip2md/styling.py | 46 ++++++++++ 4 files changed, 203 insertions(+), 233 deletions(-) create mode 100644 spip2md/styling.py diff --git a/spip2md/config.py b/spip2md/config.py index c5ad8f9..898b76e 100644 --- a/spip2md/config.py +++ b/spip2md/config.py @@ -1,5 +1,5 @@ # pyright: strict -from os.path import isfile +from os.path import expanduser, isfile from typing import Optional from yaml import Loader, load @@ -18,19 +18,29 @@ class Configuration: db_host: str = "localhost" db_user: str = "spip" db_pass: str = "password" - output_dir: str = "output" - max_articles_export: int = 1000 - max_sections_export: int = 500 - data_dir: str = "data" + output_dir: str = "output/" + data_dir: str = "data/" clear_output: bool = False prepend_h1: bool = True export_filetype: str = "md" + max_articles_export: int = 1000 # TODO reimplement with recursion + max_sections_export: int = 500 # TODO reimplement with recursion def __init__(self, config_file: Optional[str] = None): if config_file is not None: + # Read config from config file with open(config_file) as f: config = load(f.read(), Loader=Loader) + # Assign configuration for each attribute in config file for attr in config: + # If attribute is a dir, ensure that ~ is converted to home path + if type(attr) == "string" and "dir" in attr: + directory = expanduser(config[attr]) + # Ensure that directory ends with a slash + directory = ( + directory if directory.last() == "/" else directory + "/" + ) + setattr(self, attr, directory) setattr(self, attr, config[attr]) diff --git a/spip2md/main.py b/spip2md/main.py index 1f5bc2e..57d0943 100755 --- a/spip2md/main.py +++ b/spip2md/main.py @@ -1,151 +1,30 @@ #!python from os import makedirs -from os.path import expanduser -from shutil import copyfile, rmtree +from shutil import rmtree from sys import argv +from peewee import ModelSelect + from config import config from converters import unknown_chars, unknown_chars_context from database import DB from spipobjects import ( Article, - Document, Rubrique, - get_articles, - get_sections, ) - -# Define styles -BO = 1 # Bold -IT = 3 # Italic -UN = 4 # Underline -# Define colors -R = 91 # Red -G = 92 # Green -Y = 93 # Yellow -B = 94 # Blue -C0 = 95 # Color -C1 = 96 # Color -C2 = 96 # Color +from styling import highlight, style -# Print a stylized string, without trailing newline -def style(string: str, *args: int) -> None: - esc = "\033[" # Terminal escape sequence, needs to be closed by "m" - if len(args) == 0: - params: str = "1;" # Defaults to bold - else: - params: str = "" - for a in args: - params += str(a) + ";" - print(esc + params[:-1] + "m" + string + esc + "0m", end="") - - -# Print a string, highlighting every substring starting at start_stop[x][0] … -def highlight(string: str, *start_stop: tuple[int, int]) -> None: - previous_stop = 0 - for start, stop in start_stop: - print(string[previous_stop:start], end="") - style(string[start:stop], BO, R) - previous_stop = stop - print(string[previous_stop:], end="") - - -# Plural ? -def s(nb: int) -> str: - return "s" if nb > 1 else "" - - -# Indent with spaces -def indent(nb: int = 1) -> None: - for _ in range(nb): - print(" ", end="") - - -# Output information about ongoing export & write section to output destination -def write_section(index: int, total: int, section: Rubrique) -> str: - color = G # Associate sections to green - # Print the name of the exported section & number of remaining sections - style(f"{index + 1}. ", BO) - highlight(section.titre, *unknown_chars(section.titre)) - style(f" {total-index-1}", BO, color) - style(f" section{s(total-index)} left") - # Define the section’s path (directory) & create directory(ies) if needed - sectiondir: str = config.output_dir + "/" + section.slug() - makedirs(sectiondir, exist_ok=True) - # Define the section filename & write the index at that filename - sectionpath: str = sectiondir + "/" + section.filename() - with open(sectionpath, "w") as f: - f.write(section.content()) - # Print export location when finished exporting - style(" -> ", BO, color) - print(sectionpath) - # Return the first "limit" articles of section - return sectiondir - - -# Output information about ongoing export & write article to output destination -def write_article(index: int, total: int, article: Article, sectiondir: str) -> str: - color = Y # Associate articles to yellow - # Print the remaining number of articles to export every 100 articles - if index % 100 == 0: - indent() - print("Exporting", end="") - style(f" {total-index}", BO, color) - print(" SPIP", end="") - style(f" article{s(total-index)}") - print(" to Markdown & YAML files") - # Print the title of the article being exported - style( - f" {index + 1}. " - + ("EMPTY " if len(article.texte) < 1 else "") - + f"{article.lang} " +# Query the DB to retrieve all sections without parent, sorted by publication date +def root_sections(limit: int = 10**3) -> ModelSelect: + return ( + Rubrique.select() + .where(Rubrique.id_parent == 0) + .order_by(Rubrique.date.desc()) + .limit(limit) ) - highlight(article.titre, *unknown_chars(article.titre)) - # Define the full article path & create directory(ies) if needed - articledir: str = sectiondir + "/" + article.slug() - makedirs(articledir, exist_ok=True) - # Define the article filename & write the article at the filename - articlepath: str = articledir + "/" + article.filename() - with open(articlepath, "w") as f: - f.write(article.content()) - # Print export location when finished exporting - style(" -> ", BO, color) - print(articlepath) - return articledir -# Output information about ongoing export & copy document to output destination -def write_document( - index: int, total: int, document: Document, objectdir: str, indent_depth: int = 1 -) -> None: - color = B # Associate documents to blue - if index % 100 == 0: - indent(indent_depth) - print("Exporting", end="") - style(f" {total-index}", BO, color) - style(f" document{s(total-index)}\n") - # Print the name of the file with a counter - indent(indent_depth) - style(f"{index + 1}. {document.media} ") - if len(document.titre) > 0: - highlight(document.titre + " ", *unknown_chars(document.titre)) - style("at ") - print(document.fichier, end="") - # Define document path - documentpath: str = expanduser(config.data_dir + "/" + document.fichier) - # Copy the document from it’s SPIP location to the new location - try: - copyfile(documentpath, objectdir + "/" + document.slug()) - except FileNotFoundError: - style(" -> NOT FOUND!\n", BO, R) - else: - # Print the outputted file’s path when copied the file - style(" ->", BO, color) - print(f" {objectdir}/{document.slug()}") - - -# Return true if an article field contains an unknown character def has_unknown_chars(article: Article) -> bool: if len(unknown_chars_context(article.texte)) > 0: return True @@ -198,39 +77,13 @@ if __name__ == "__main__": # Make a list containing articles where unknown characters are detected unknown_chars_articles: list[Article] = [] - # Get sections with an eventual maximum - sections = get_sections(max_sections_export) - nb_sections_export: int = len(sections) + # Write each root sections with its subtree + for section in root_sections(max_sections_export): + section.write() + print() # Break line after exporting the section - # Loop among sections & export them - for i, section in enumerate(sections): - # Get section’s documents & link them - documents = section.documents() - # Write the section and store its output directory - sectiondir = write_section(i, nb_sections_export, section) - # Loop over section’s related documents (images …) - for i, document in enumerate(documents): - write_document(i, len(documents), document, sectiondir) - # Loop over section’s articles - articles = get_articles(section.id_rubrique, (max_articles_export)) - for i, article in enumerate(articles): - # Get article’s documents & link them - documents = article.documents() - # Write the article and store its output directory - articledir = write_article(i, len(articles), article, sectiondir) - # Add article to unknown_chars_articles if needed - if has_unknown_chars(article): - unknown_chars_articles.append(article) - # Decrement export limit - max_articles_export -= 1 - # Loop over article’s related documents (images …) - for i, document in enumerate(documents): - write_document(i, len(documents), document, articledir, 2) - # Break line when finished exporting the section - print() - - print() # Break line - # Loop through each article that contains an unknown character + print() # Break line between export & unknown characters warning + # Warn about each article that contains unknown(s) character(s) for article in unknown_chars_articles: warn_unknown_chars(article) diff --git a/spip2md/spipobjects.py b/spip2md/spipobjects.py index eb2093e..f64fc2d 100644 --- a/spip2md/spipobjects.py +++ b/spip2md/spipobjects.py @@ -1,13 +1,15 @@ +from os import makedirs from os.path import basename, splitext -from re import I, compile, finditer +from re import finditer +from shutil import copyfile from typing import Any -from peewee import ModelSelect +from peewee import Model, ModelSelect from slugify import slugify from yaml import dump from config import config -from converters import convert, link_document +from converters import convert, link_document, unknown_chars from database import ( SpipArticles, SpipAuteurs, @@ -16,9 +18,50 @@ from database import ( SpipDocumentsLiens, SpipRubriques, ) +from styling import BLUE, BOLD, GREEN, YELLOW, highlight, indent, ss, style -class Document(SpipDocuments): +class SpipWritable: + class Meta: + table_name: str + + term_color: int + texte: str + lang: str + titre: str + + def filename(self, date: bool = False) -> str: + raise NotImplementedError("Subclasses need to implement filename()") + + # Output information about file that will be exported + def begin_message( + self, index: int, limit: int, depth: int = 0, step: int = 100 + ) -> None: + # Print the remaining number of objects to export every step object + if index % step == 0: + indent(depth) + print("Exporting", end="") + style(f" {limit-index}", BOLD, self.term_color) + print(f" element{ss(limit-index)} from", end="") + style(f" {self.Meta.table_name}") + # Print the counter & title of the object being exported + indent(depth) + style(f"{index + 1}. ") + highlight(self.titre, *unknown_chars(self.titre)) + # + ("EMPTY " if len(self.texte) < 1 else "") + # + f"{self.lang} " + + # Write object to output destination + def write(self, export_dir: str) -> None: + raise NotImplementedError("Subclasses need to implement write()") + + # Output information about file that was just exported + def end_message(self, export_dir: str): + style(" -> ", BOLD, self.term_color) + print(export_dir + self.filename()) + + +class Document(SpipWritable, SpipDocuments): class Meta: table_name: str = "spip_documents" @@ -27,17 +70,32 @@ class Document(SpipDocuments): self.titre: str = convert(self.titre, True) self.descriptif: str = convert(self.descriptif, True) self.statut: str = "false" if self.statut == "publie" else "true" + # Terminal output color + self.term_color: int = BLUE - def slug(self, date: bool = False) -> str: + # Get slugified name of this file + def filename(self, date: bool = False) -> str: name_type: tuple[str, str] = splitext(basename(self.fichier)) return ( slugify((self.date_publication + "-" if date else "") + name_type[0]) + name_type[1] ) + # Write document to output destination + def write(self, export_dir: str) -> None: + # Copy the document from it’s SPIP location to the new location + try: + copyfile(config.data_dir + self.fichier, export_dir + self.filename()) + except FileNotFoundError: + raise FileNotFoundError(" -> NOT FOUND!\n") from None -class SpipObject: + +class SpipObject(SpipWritable): id: int + id_trad: int + date: str + maj: str + id_secteur: int def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -48,10 +106,16 @@ class SpipObject: self.statut: str = "false" if self.statut == "publie" else "true" self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true" self.extra: str = convert(self.extra) # Probably unused - # Define file prefix (need to be changed later) + # Define file prefix (needs to be redefined for sections) self.prefix = "index" - def documents(self) -> ModelSelect: + # Convert SPIP style internal links for images & other files into Markdown style + def link_documents(self, documents: ModelSelect) -> None: + for d in documents: + self.texte = link_document(self.texte, d.id_document, d.titre, d.slug()) + + # Output related documents & link them in the text by the way + def documents(self, link_documents: bool = True) -> ModelSelect: documents = ( Document.select() .join( @@ -60,23 +124,44 @@ class SpipObject: ) .where(SpipDocumentsLiens.id_objet == self.id) ) - for d in documents: - self.texte = link_document(self.texte, d.id_document, d.titre, d.slug()) - # Internal (articles) links - self.text = link_articles(self.texte) + if link_documents: + self.link_documents(documents) return documents - def slug(self, date: bool = False) -> str: - return slugify((self.date + "-" if date else "") + self.titre) + # Convert SPIP style internal links for other articles or sections into Markdown + def link_articles(self) -> None: + for match in finditer(r"\[(.*?)]\((?:art|article)([0-9]+)\)", self.texte): + article = Article.get(Article.id_article == match.group(2)) + if len(match.group(1)) > 0: + title: str = match.group(1) + else: + title: str = article.titre + self.texte = self.texte.replace( + match.group(0), f"[{title}]({article.slug()}/{article.filename()})" + ) + # Output related articles + def articles(self) -> ModelSelect: + return ( + Article.select() + .where(Article.id_rubrique == self.id) + .order_by(Article.date.desc()) + # .limit(limit) + ) + + # Get slugified directory of this object + def dir_slug(self, include_date: bool = False, end_slash: bool = True) -> str: + date: str = self.date + "-" if include_date else "" + slash: str = "/" if end_slash else "" + return slugify(date + self.titre) + slash + + # Get filename of this object def filename(self) -> str: return self.prefix + "." + self.lang + "." + config.export_filetype - def frontmatter(self) -> str: - raise NotImplementedError("Subclasses must implement 'frontmatter' method.") - - def common_frontmatter(self) -> dict[str, Any]: - return { + # Get the YAML frontmatter string + def frontmatter(self, append: dict[str, Any] = {}) -> str: + meta: dict[str, Any] = { "lang": self.lang, "translationKey": self.id_trad, "title": self.titre, @@ -88,9 +173,12 @@ class SpipObject: "spip_id_secteur": self.id_secteur, "spip_id": self.id, } + return dump(meta | append, allow_unicode=True) - def body(self) -> str: - body: str = "" + # Get file text content + def content(self) -> str: + # Start the content with frontmatter + body: str = "---\n" + self.frontmatter() + "---" # Add the title as a Markdown h1 if len(self.titre) > 0 and config.prepend_h1: body += "\n\n# " + self.titre @@ -103,9 +191,10 @@ class SpipObject: body += "\n\n# EXTRA\n\n" + self.extra return body - def content(self) -> str: - # Return the final article text - return "---\n" + self.frontmatter() + "---" + self.body() + # Write object to output destination + def write(self, export_dir: str) -> None: + with open(export_dir + self.filename(), "w") as f: + f.write(self.content()) class Article(SpipObject, SpipArticles): @@ -122,11 +211,12 @@ class Article(SpipObject, SpipArticles): self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false" # ID self.id = self.id_article + # Terminal output color + self.term_color = YELLOW - def frontmatter(self) -> str: - return dump( + def frontmatter(self, append: dict[str, Any] = {}) -> str: + return super().frontmatter( { - **super().common_frontmatter(), # Article specific "summary": self.chapo, "surtitle": self.surtitre, @@ -135,12 +225,11 @@ class Article(SpipObject, SpipArticles): "authors": [author.nom for author in self.authors()], # Debugging "spip_id_rubrique": self.id_rubrique, - }, - allow_unicode=True, + } ) - def body(self) -> str: - body: str = super().body() + def content(self) -> str: + body: str = super().content() # If there is a caption, add the caption followed by a hr if len(self.chapo) > 0: body += "\n\n" + self.chapo + "\n\n***" @@ -163,29 +252,6 @@ class Article(SpipObject, SpipArticles): ) -# Query the DB to retrieve all articles sorted by publication date -def get_articles(section_id: int, limit: int = 10**6) -> ModelSelect: - return ( - Article.select() - .where(Article.id_rubrique == section_id) - .order_by(Article.date.desc()) - .limit(limit) - ) - - -def link_articles(text: str): - for match in finditer(r"\[(.*?)]\((?:art|article)([0-9]+)\)", text): - article = Article.get(Article.id_article == match.group(2)) - if len(match.group(1)) > 0: - title: str = match.group(1) - else: - title: str = article.titre - text = text.replace( - match.group(0), f"[{title}]({article.slug()}/{article.filename()})" - ) - return text - - class Rubrique(SpipObject, SpipRubriques): class Meta: table_name: str = "spip_rubriques" @@ -196,19 +262,14 @@ class Rubrique(SpipObject, SpipRubriques): self.id = self.id_rubrique # File prefix self.prefix = "_index" + # Terminal output color + self.term_color = GREEN - def frontmatter(self) -> str: - return dump( + def frontmatter(self, append: dict[str, Any] = {}) -> str: + return super().frontmatter( { - **super().common_frontmatter(), # Debugging "spip_id_parent": self.id_parent, "spip_profondeur": self.profondeur, - }, - allow_unicode=True, + } ) - - -# Query the DB to retrieve all sections sorted by publication date -def get_sections(limit: int = 10**6) -> ModelSelect: - return Rubrique.select().order_by(Rubrique.date.desc()).limit(limit) diff --git a/spip2md/styling.py b/spip2md/styling.py new file mode 100644 index 0000000..a1ab23d --- /dev/null +++ b/spip2md/styling.py @@ -0,0 +1,46 @@ +# pyright: strict +# Define styles +BOLD = 1 # Bold +ITALIC = 3 # Italic +UNDER = 4 # Underline +# Define colors +RED = 91 # Red +GREEN = 92 # Green +YELLOW = 93 # Yellow +BLUE = 94 # Blue +C0 = 95 # Color +C1 = 96 # Color +C2 = 96 # Color + + +# Print a stylized string, without trailing newline +def style(string: str, *args: int, end: str = "") -> None: + esc = "\033[" # Terminal escape sequence, needs to be closed by "m" + if len(args) == 0: + params: str = "1;" # Defaults to bold + else: + params: str = "" + for a in args: + params += str(a) + ";" + print(esc + params[:-1] + "m" + string + esc + "0m", end=end) + + +# Print a string, highlighting every substring starting at start_stop[x][0] … +def highlight(string: str, *start_stop: tuple[int, int], end: str = "") -> None: + previous_stop = 0 + for start, stop in start_stop: + print(string[previous_stop:start], end="") + style(string[start:stop], BOLD, RED) + previous_stop = stop + print(string[previous_stop:], end=end) + + +# Plural ? +def ss(nb: int) -> str: + return "s" if nb > 1 else "" + + +# Indent with 2 spaces +def indent(nb: int = 1) -> None: + for _ in range(nb): + print(" ", end="")