start refactor to make section tree exporting recursive to be able to handle the whole depth of SPIP section tree + code deduplication

2023-05-25 13:52:00 +02:00 · 2023-05-25 13:52:00 +02:00 · f06d09d338
commit f06d09d338
parent 7e3680d282
4 changed files with 203 additions and 233 deletions
--- a/spip2md/config.py
+++ b/spip2md/config.py
@ -1,5 +1,5 @@
 # pyright: strict
-from os.path import isfile
+from os.path import expanduser, isfile
 from typing import Optional

 from yaml import Loader, load
@ -18,19 +18,29 @@ class Configuration:
    db_host: str = "localhost"
    db_user: str = "spip"
    db_pass: str = "password"
-    output_dir: str = "output"
-    max_articles_export: int = 1000
-    max_sections_export: int = 500
-    data_dir: str = "data"
+    output_dir: str = "output/"
+    data_dir: str = "data/"
    clear_output: bool = False
    prepend_h1: bool = True
    export_filetype: str = "md"
+    max_articles_export: int = 1000  # TODO reimplement with recursion
+    max_sections_export: int = 500  # TODO reimplement with recursion

    def __init__(self, config_file: Optional[str] = None):
        if config_file is not None:
+            # Read config from config file (full YAML Loader: only load trusted files)
            with open(config_file) as f:
                config = load(f.read(), Loader=Loader)
+            # Assign configuration for each attribute in config file
            for attr in config:
+                # If attribute is a dir, ensure that ~ is converted to home path
+                if isinstance(attr, str) and "dir" in attr:
+                    directory = expanduser(config[attr])
+                    # Ensure that directory ends with a slash
+                    if not directory.endswith("/"):
+                        directory += "/"
+                    setattr(self, attr, directory)
+                    continue  # Keep the normalized path, skip the raw assignment
                setattr(self, attr, config[attr])


--- a/spip2md/main.py
+++ b/spip2md/main.py
@ -1,151 +1,30 @@
 #!python
 from os import makedirs
-from os.path import expanduser
-from shutil import copyfile, rmtree
+from shutil import rmtree
 from sys import argv

+from peewee import ModelSelect
+
 from config import config
 from converters import unknown_chars, unknown_chars_context
 from database import DB
 from spipobjects import (
    Article,
-    Document,
    Rubrique,
-    get_articles,
-    get_sections,
 )
-
-# Define styles
-BO = 1  # Bold
-IT = 3  # Italic
-UN = 4  # Underline
-# Define colors
-R = 91  # Red
-G = 92  # Green
-Y = 93  # Yellow
-B = 94  # Blue
-C0 = 95  # Color
-C1 = 96  # Color
-C2 = 96  # Color
+from styling import highlight, style


-# Print a stylized string, without trailing newline
-def style(string: str, *args: int) -> None:
-    esc = "\033["  # Terminal escape sequence, needs to be closed by "m"
-    if len(args) == 0:
-        params: str = "1;"  # Defaults to bold
-    else:
-        params: str = ""
-    for a in args:
-        params += str(a) + ";"
-    print(esc + params[:-1] + "m" + string + esc + "0m", end="")
-
-
-# Print a string, highlighting every substring starting at start_stop[x][0] …
-def highlight(string: str, *start_stop: tuple[int, int]) -> None:
-    previous_stop = 0
-    for start, stop in start_stop:
-        print(string[previous_stop:start], end="")
-        style(string[start:stop], BO, R)
-        previous_stop = stop
-    print(string[previous_stop:], end="")
-
-
-# Plural ?
-def s(nb: int) -> str:
-    return "s" if nb > 1 else ""
-
-
-# Indent with spaces
-def indent(nb: int = 1) -> None:
-    for _ in range(nb):
-        print("  ", end="")
-
-
-# Output information about ongoing export & write section to output destination
-def write_section(index: int, total: int, section: Rubrique) -> str:
-    color = G  # Associate sections to green
-    # Print the name of the exported section & number of remaining sections
-    style(f"{index + 1}. ", BO)
-    highlight(section.titre, *unknown_chars(section.titre))
-    style(f" {total-index-1}", BO, color)
-    style(f" section{s(total-index)} left")
-    # Define the section’s path (directory) & create directory(ies) if needed
-    sectiondir: str = config.output_dir + "/" + section.slug()
-    makedirs(sectiondir, exist_ok=True)
-    # Define the section filename & write the index at that filename
-    sectionpath: str = sectiondir + "/" + section.filename()
-    with open(sectionpath, "w") as f:
-        f.write(section.content())
-    # Print export location when finished exporting
-    style(" -> ", BO, color)
-    print(sectionpath)
-    # Return the first "limit" articles of section
-    return sectiondir
-
-
-# Output information about ongoing export & write article to output destination
-def write_article(index: int, total: int, article: Article, sectiondir: str) -> str:
-    color = Y  # Associate articles to yellow
-    # Print the remaining number of articles to export every 100 articles
-    if index % 100 == 0:
-        indent()
-        print("Exporting", end="")
-        style(f" {total-index}", BO, color)
-        print(" SPIP", end="")
-        style(f" article{s(total-index)}")
-        print(" to Markdown & YAML files")
-    # Print the title of the article being exported
-    style(
-        f"  {index + 1}. "
-        + ("EMPTY " if len(article.texte) < 1 else "")
-        + f"{article.lang} "
+# Query the DB to retrieve every section that has no parent (id_parent is 0),
+# sorted by date in descending order & capped to limit results
+def root_sections(limit: int = 10**3) -> ModelSelect:
+    parentless = Rubrique.id_parent == 0
+    newest_first = Rubrique.date.desc()
+    return (
+        Rubrique.select().where(parentless).order_by(newest_first).limit(limit)
    )
-    highlight(article.titre, *unknown_chars(article.titre))
-    # Define the full article path & create directory(ies) if needed
-    articledir: str = sectiondir + "/" + article.slug()
-    makedirs(articledir, exist_ok=True)
-    # Define the article filename & write the article at the filename
-    articlepath: str = articledir + "/" + article.filename()
-    with open(articlepath, "w") as f:
-        f.write(article.content())
-    # Print export location when finished exporting
-    style(" -> ", BO, color)
-    print(articlepath)
-    return articledir


-# Output information about ongoing export & copy document to output destination
-def write_document(
-    index: int, total: int, document: Document, objectdir: str, indent_depth: int = 1
-) -> None:
-    color = B  # Associate documents to blue
-    if index % 100 == 0:
-        indent(indent_depth)
-        print("Exporting", end="")
-        style(f" {total-index}", BO, color)
-        style(f" document{s(total-index)}\n")
-    # Print the name of the file with a counter
-    indent(indent_depth)
-    style(f"{index + 1}. {document.media} ")
-    if len(document.titre) > 0:
-        highlight(document.titre + " ", *unknown_chars(document.titre))
-    style("at ")
-    print(document.fichier, end="")
-    # Define document path
-    documentpath: str = expanduser(config.data_dir + "/" + document.fichier)
-    # Copy the document from it’s SPIP location to the new location
-    try:
-        copyfile(documentpath, objectdir + "/" + document.slug())
-    except FileNotFoundError:
-        style(" -> NOT FOUND!\n", BO, R)
-    else:
-        # Print the outputted file’s path when copied the file
-        style(" ->", BO, color)
-        print(f" {objectdir}/{document.slug()}")
-
-
-# Return true if an article field contains an unknown character
 def has_unknown_chars(article: Article) -> bool:
    if len(unknown_chars_context(article.texte)) > 0:
        return True
@ -198,39 +77,13 @@ if __name__ == "__main__":
    # Make a list containing articles where unknown characters are detected
    unknown_chars_articles: list[Article] = []

-    # Get sections with an eventual maximum
-    sections = get_sections(max_sections_export)
-    nb_sections_export: int = len(sections)
+    # Write each root section with its subtree
+    for section in root_sections(max_sections_export):
+        section.write()
+        print()  # Break line after exporting the section

-    # Loop among sections & export them
-    for i, section in enumerate(sections):
-        # Get section’s documents & link them
-        documents = section.documents()
-        # Write the section and store its output directory
-        sectiondir = write_section(i, nb_sections_export, section)
-        # Loop over section’s related documents (images …)
-        for i, document in enumerate(documents):
-            write_document(i, len(documents), document, sectiondir)
-        # Loop over section’s articles
-        articles = get_articles(section.id_rubrique, (max_articles_export))
-        for i, article in enumerate(articles):
-            # Get article’s documents & link them
-            documents = article.documents()
-            # Write the article and store its output directory
-            articledir = write_article(i, len(articles), article, sectiondir)
-            # Add article to unknown_chars_articles if needed
-            if has_unknown_chars(article):
-                unknown_chars_articles.append(article)
-            # Decrement export limit
-            max_articles_export -= 1
-            # Loop over article’s related documents (images …)
-            for i, document in enumerate(documents):
-                write_document(i, len(documents), document, articledir, 2)
-        # Break line when finished exporting the section
-        print()
-
-    print()  # Break line
-    # Loop through each article that contains an unknown character
+    print()  # Break line between export & unknown characters warning
+    # Warn about each article that contains unknown character(s)
    for article in unknown_chars_articles:
        warn_unknown_chars(article)

--- a/spip2md/spipobjects.py
+++ b/spip2md/spipobjects.py
@ -1,13 +1,15 @@
+from os import makedirs
 from os.path import basename, splitext
-from re import I, compile, finditer
+from re import finditer
+from shutil import copyfile
 from typing import Any

-from peewee import ModelSelect
+from peewee import Model, ModelSelect
 from slugify import slugify
 from yaml import dump

 from config import config
-from converters import convert, link_document
+from converters import convert, link_document, unknown_chars
 from database import (
    SpipArticles,
    SpipAuteurs,
@ -16,9 +18,50 @@ from database import (
    SpipDocumentsLiens,
    SpipRubriques,
 )
+from styling import BLUE, BOLD, GREEN, YELLOW, highlight, indent, ss, style


-class Document(SpipDocuments):
+# Base of exportable SPIP objects: terminal messages & filename()/write() contract
+class SpipWritable:
+    class Meta:
+        table_name: str
+
+    term_color: int
+    texte: str
+    lang: str
+    titre: str
+
+    def filename(self, date: bool = False) -> str:
+        raise NotImplementedError("Subclasses need to implement filename()")
+
+    # Output information about file that will be exported
+    def begin_message(
+        self, index: int, limit: int, depth: int = 0, step: int = 100
+    ) -> None:
+        # Print the remaining number of objects to export every step object
+        if index % step == 0:
+            indent(depth)
+            print("Exporting", end="")
+            style(f" {limit-index}", BOLD, self.term_color)
+            print(f" element{ss(limit-index)} from", end="")
+            style(f" {self.Meta.table_name}", end="\n")  # End the header line
+        # Print the counter & title of the object being exported
+        indent(depth)
+        style(f"{index + 1}. ")
+        highlight(self.titre, *unknown_chars(self.titre))
+        # TODO also print "EMPTY " if len(self.texte) < 1, and self.lang
+
+    # Write object to output destination
+    def write(self, export_dir: str) -> None:
+        raise NotImplementedError("Subclasses need to implement write()")
+
+    # Output information about file that was just exported
+    def end_message(self, export_dir: str) -> None:
+        style(" -> ", BOLD, self.term_color)
+        print(export_dir + self.filename())
+
+
+class Document(SpipWritable, SpipDocuments):
    class Meta:
        table_name: str = "spip_documents"

@ -27,17 +70,32 @@ class Document(SpipDocuments):
        self.titre: str = convert(self.titre, True)
        self.descriptif: str = convert(self.descriptif, True)
        self.statut: str = "false" if self.statut == "publie" else "true"
+        # Terminal output color
+        self.term_color: int = BLUE

-    def slug(self, date: bool = False) -> str:
+    # Get slugified name of this file
+    def filename(self, date: bool = False) -> str:
        name_type: tuple[str, str] = splitext(basename(self.fichier))
        return (
            slugify((self.date_publication + "-" if date else "") + name_type[0])
            + name_type[1]
        )

+    # Copy the document from its SPIP location (under config.data_dir) to export_dir
+    def write(self, export_dir: str) -> None:
+        source, target = config.data_dir + self.fichier, export_dir + self.filename()
+        try:
+            copyfile(source, target)
+        except FileNotFoundError:
+            raise FileNotFoundError(" -> NOT FOUND!\n") from None

-class SpipObject:
+
+class SpipObject(SpipWritable):
    id: int
+    id_trad: int
+    date: str
+    maj: str
+    id_secteur: int

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
@ -48,10 +106,16 @@ class SpipObject:
        self.statut: str = "false" if self.statut == "publie" else "true"
        self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
        self.extra: str = convert(self.extra)  # Probably unused
-        # Define file prefix (need to be changed later)
+        # Define file prefix (needs to be redefined for sections)
        self.prefix = "index"

-    def documents(self) -> ModelSelect:
+    # Convert SPIP style internal links for images & other files into Markdown style
+    def link_documents(self, documents: ModelSelect) -> None:
+        for d in documents:  # Document.slug() was renamed to filename()
+            self.texte = link_document(self.texte, d.id_document, d.titre, d.filename())
+
+    # Output related documents & link them in the text by the way
+    def documents(self, link_documents: bool = True) -> ModelSelect:
        documents = (
            Document.select()
            .join(
@ -60,23 +124,44 @@ class SpipObject:
            )
            .where(SpipDocumentsLiens.id_objet == self.id)
        )
-        for d in documents:
-            self.texte = link_document(self.texte, d.id_document, d.titre, d.slug())
-        # Internal (articles) links
-        self.text = link_articles(self.texte)
+        if link_documents:
+            self.link_documents(documents)
        return documents

-    def slug(self, date: bool = False) -> str:
-        return slugify((self.date + "-" if date else "") + self.titre)
+    # Convert SPIP style internal links for other articles or sections into Markdown
+    def link_articles(self) -> None:
+        for match in finditer(r"\[(.*?)]\((?:art|article)([0-9]+)\)", self.texte):
+            article = Article.get(Article.id_article == match.group(2))
+            # Keep the link text if there is one, else use the article title
+            title: str = match.group(1) or article.titre
+            # dir_slug() already ends with a slash, it replaces the old slug() + "/"
+            target: str = article.dir_slug() + article.filename()
+            self.texte = self.texte.replace(
+                match.group(0), f"[{title}]({target})"
+            )

+    # Output related articles: those whose id_rubrique is this object's id,
+    # sorted by date in descending order
+    def articles(self) -> ModelSelect:
+        in_this_section = Article.id_rubrique == self.id
+        newest_first = Article.date.desc()
+        # TODO the export limit is not applied here yet: .limit(limit)
+        query: ModelSelect = Article.select().where(in_this_section)
+        return query.order_by(newest_first)
+
+    # Get slugified directory of this object (date prefix & end slash optional)
+    def dir_slug(self, include_date: bool = False, end_slash: bool = True) -> str:
+        name: str = self.date + "-" + self.titre if include_date else self.titre
+        slug: str = slugify(name)
+        return slug + "/" if end_slash else slug
+
+    # Get filename of this object
    def filename(self) -> str:
        return self.prefix + "." + self.lang + "." + config.export_filetype

-    def frontmatter(self) -> str:
-        raise NotImplementedError("Subclasses must implement 'frontmatter' method.")
-
-    def common_frontmatter(self) -> dict[str, Any]:
-        return {
+    # Get the YAML frontmatter string
+    def frontmatter(self, append: dict[str, Any] = {}) -> str:
+        meta: dict[str, Any] = {
            "lang": self.lang,
            "translationKey": self.id_trad,
            "title": self.titre,
@ -88,9 +173,12 @@ class SpipObject:
            "spip_id_secteur": self.id_secteur,
            "spip_id": self.id,
        }
+        return dump(meta | append, allow_unicode=True)

-    def body(self) -> str:
-        body: str = ""
+    # Get file text content
+    def content(self) -> str:
+        # Start the content with frontmatter
+        body: str = "---\n" + self.frontmatter() + "---"
        # Add the title as a Markdown h1
        if len(self.titre) > 0 and config.prepend_h1:
            body += "\n\n# " + self.titre
@ -103,9 +191,10 @@ class SpipObject:
            body += "\n\n# EXTRA\n\n" + self.extra
        return body

-    def content(self) -> str:
-        # Return the final article text
-        return "---\n" + self.frontmatter() + "---" + self.body()
+    # Write object to output destination, as UTF-8 whatever the system locale is
+    def write(self, export_dir: str) -> None:
+        with open(export_dir + self.filename(), "w", encoding="utf-8") as f:
+            f.write(self.content())


 class Article(SpipObject, SpipArticles):
@ -122,11 +211,12 @@ class Article(SpipObject, SpipArticles):
        self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"
        # ID
        self.id = self.id_article
+        # Terminal output color
+        self.term_color = YELLOW

-    def frontmatter(self) -> str:
-        return dump(
+    def frontmatter(self, append: dict[str, Any] = {}) -> str:
+        return super().frontmatter(
            {
-                **super().common_frontmatter(),
                # Article specific
                "summary": self.chapo,
                "surtitle": self.surtitre,
@ -135,12 +225,11 @@ class Article(SpipObject, SpipArticles):
                "authors": [author.nom for author in self.authors()],
                # Debugging
                "spip_id_rubrique": self.id_rubrique,
-            },
-            allow_unicode=True,
+            }
        )

-    def body(self) -> str:
-        body: str = super().body()
+    def content(self) -> str:
+        body: str = super().content()
        # If there is a caption, add the caption followed by a hr
        if len(self.chapo) > 0:
            body += "\n\n" + self.chapo + "\n\n***"
@ -163,29 +252,6 @@ class Article(SpipObject, SpipArticles):
        )


-# Query the DB to retrieve all articles sorted by publication date
-def get_articles(section_id: int, limit: int = 10**6) -> ModelSelect:
-    return (
-        Article.select()
-        .where(Article.id_rubrique == section_id)
-        .order_by(Article.date.desc())
-        .limit(limit)
-    )
-
-
-def link_articles(text: str):
-    for match in finditer(r"\[(.*?)]\((?:art|article)([0-9]+)\)", text):
-        article = Article.get(Article.id_article == match.group(2))
-        if len(match.group(1)) > 0:
-            title: str = match.group(1)
-        else:
-            title: str = article.titre
-        text = text.replace(
-            match.group(0), f"[{title}]({article.slug()}/{article.filename()})"
-        )
-    return text
-
-
 class Rubrique(SpipObject, SpipRubriques):
    class Meta:
        table_name: str = "spip_rubriques"
@ -196,19 +262,14 @@ class Rubrique(SpipObject, SpipRubriques):
        self.id = self.id_rubrique
        # File prefix
        self.prefix = "_index"
+        # Terminal output color
+        self.term_color = GREEN

-    def frontmatter(self) -> str:
-        return dump(
+    def frontmatter(self, append: dict[str, Any] = {}) -> str:
+        return super().frontmatter(
            {
-                **super().common_frontmatter(),
                # Debugging
                "spip_id_parent": self.id_parent,
                "spip_profondeur": self.profondeur,
-            },
-            allow_unicode=True,
+            }
        )
-
-
-# Query the DB to retrieve all sections sorted by publication date
-def get_sections(limit: int = 10**6) -> ModelSelect:
-    return Rubrique.select().order_by(Rubrique.date.desc()).limit(limit)
--- a/spip2md/styling.py
+++ b/spip2md/styling.py
@ -0,0 +1,46 @@
+# pyright: strict
+# Define styles (numeric codes that style() puts between "\033[" and "m")
+BOLD = 1  # Bold
+ITALIC = 3  # Italic
+UNDER = 4  # Underline
+# Define colors (passed to style() the same way as the styles)
+RED = 91  # Red
+GREEN = 92  # Green
+YELLOW = 93  # Yellow
+BLUE = 94  # Blue
+C0 = 95  # Color (magenta in the ANSI bright palette)
+C1 = 96  # Color (cyan in the ANSI bright palette)
+C2 = 96  # Color, NOTE(review): same code as C1, possibly meant to be 97; confirm
+
+
+# Print a stylized string, without trailing newline unless end says otherwise.
+# args are the style & color codes to apply, they get joined with ";" inside
+# one escape sequence; without any code, bold is used.
+# The styling is reset right after string.
+def style(string: str, *args: int, end: str = "") -> None:
+    esc = "\033["  # Terminal escape sequence, needs to be closed by "m"
+    # Defaults to bold
+    codes: str = ";".join(str(code) for code in args) if args else "1"
+    opening, reset = esc + codes + "m", esc + "0m"
+    print(opening + string + reset, end=end)
+
+
+# Print string with every [start, stop) span of start_stop styled in bold red
+def highlight(string: str, *start_stop: tuple[int, int], end: str = "") -> None:
+    cursor = 0  # Index up to which string has already been printed
+    for begin, finish in start_stop:
+        print(string[cursor:begin], end="")
+        style(string[begin:finish], BOLD, RED)
+        cursor = finish
+    print(string[cursor:], end=end)
+
+
+# Plural suffix: "s" when nb is greater than 1, else an empty string
+def ss(nb: int) -> str:
+    return "" if not nb > 1 else "s"
+
+
+# Indent with 2 spaces per level, without trailing newline
+def indent(nb: int = 1) -> None:
+    # Repeat the 2 spaces unit nb times & print it at once
+    print("  " * nb, end="")