iterate over sections, creating _index.md files. commenting. more context with unknown characters

2023-05-17 14:29:57 +02:00 · 2023-05-17 14:29:57 +02:00 · 6740035958
commit 6740035958
parent d15ad5fd8e
2 changed files with 187 additions and 133 deletions
--- a/spip2md/lib.py
+++ b/spip2md/lib.py
@ -9,29 +9,92 @@ from database import SpipArticles, SpipAuteurs, SpipAuteursLiens, SpipRubriques
 # from yaml import CDumper as Dumper
 FILETYPE: str = "md"
-class Article:
+
-    def __init__(self, article):
+class Item:
    id: int
    def __init__(self, item) -> None:
        """Copy the fields shared by all SPIP items from a database row.

        `item` is a row object exposing the SPIP columns read below
        (`titre`, `id_rubrique`, `descriptif`, `texte`, `date`, `statut`,
        `id_secteur`, `maj`, `lang`, `langue_choisie`, `id_trad`, `extra`).
        Title and description go through `convert_meta`, the body through
        `convert_body`.
        """
        self.title: str = convert_meta(item.titre)
        self.section_id: int = item.id_rubrique
        self.description: str = convert_meta(item.descriptif)
        self.text: str = convert_body(item.texte)  # Markdown
        self.publication: str = item.date
        # An item is a draft unless its SPIP status is "publie" (published).
        # The comparison was previously inverted, flagging published items as
        # drafts, unlike the Article and Section initialisers in this file.
        self.draft: bool = item.statut != "publie"
        self.sector_id: int = item.id_secteur
        self.update: str = item.maj
        self.lang: str = item.lang
        self.set_lang: bool = item.langue_choisie  # TODO Why ?
        self.translation_key: int = item.id_trad
        self.extra: str = item.extra  # Probably unused
    def get_slug(self, date: bool = False) -> str:
        """Return the slugified "<publication>-<title>" identifier.

        The publication date is only included when `date` is true; otherwise
        the string handed to `slugify` is "-<title>".
        """
        prefix = self.publication if date else ""
        return slugify(f"{prefix}-{self.title}")
    def get_filename(self) -> str:
        """Return the output file name: "index.<lang>.<FILETYPE>"."""
        return ".".join(("index", self.lang, FILETYPE))
    def get_frontmatter(self) -> str:
        """Serialize the item's metadata with `dump` and return the result.

        The `spip_*` keys carry the original SPIP identifiers and are kept for
        debugging purposes.
        """
        metadata = {
            "lang": self.lang,
            "translationKey": self.translation_key,
            "title": self.title,
            "publishDate": self.publication,
            "lastmod": self.update,
            "draft": self.draft,
            "description": self.description,
            # Debugging
            "spip_id": self.id,
            "spip_id_secteur": self.sector_id,
        }
        return dump(metadata, allow_unicode=True)
    def get_content(self) -> str:
        """Assemble the full file content: frontmatter followed by the body.

        The result is a sequence of chunks separated by blank lines: the
        frontmatter fenced by "---", then (each only when non-empty) the
        caption followed by a horizontal rule, the title as a level-1 heading,
        the text, and the EXTRA / POST-SCRIPTUM / MICROBLOGGING sections.
        `caption`, `ps` and `microblog` are only used when the instance
        actually has those attributes.
        """
        # Every chunk is joined to the previous one by a blank line
        chunks: list[str] = ["---\n" + self.get_frontmatter() + "---"]
        # Caption, closed by a horizontal rule
        if hasattr(self, "caption") and len(self.caption) > 0:
            chunks.append(self.caption + "\n\n***")
        # Title as a Markdown h1
        if len(self.title) > 0:
            chunks.append("# " + self.title)
        # Main text
        if len(self.text) > 0:
            chunks.append(self.text)
        # "Extra" section
        if self.extra is not None and len(self.extra) > 0:
            chunks.append("# EXTRA\n\n" + self.extra)
        # Post-scriptum
        if hasattr(self, "ps") and len(self.ps) > 0:
            chunks.append("# POST-SCRIPTUM\n\n" + self.ps)
        # Microblog
        if hasattr(self, "microblog") and len(self.microblog) > 0:
            chunks.append("# MICROBLOGGING\n\n" + self.microblog)
        return "\n\n".join(chunks)
    def get_unknown_chars(self) -> list[str]:
        """Return every excerpt of the title and text that contains an unknown character.

        For each entry of `unknown_iso`, every match in `self.title` and
        `self.text` is collected together with up to 20 characters of leading
        context and the rest of the line.

        Returns:
            The matched excerpts, title matches first, then text matches.
        """
        errors: list[str] = []
        for text in (self.title, self.text):
            for char in unknown_iso:
                # The bounded repetition must be written `{0,20}`: the former
                # `{0-20}` is not a quantifier, so it was matched as literal
                # text and no context was ever captured.
                # NOTE(review): `char` is inserted unescaped, so it is treated
                # as a regex fragment; confirm `unknown_iso` holds no regex
                # metacharacters, or wrap it in `re.escape`.
                pattern = r".{0,20}" + char + r".*(?=\r?\n|$)"
                errors.extend(match.group() for match in finditer(pattern, text))
        return errors
 class Article(Item):
    def __init__(self, article) -> None:
        """Populate the article from a SPIP article database row.

        The shared fields are first set by the parent initialiser, then the
        article-specific columns of `article` are copied onto the instance.
        """
        super().__init__(article)
        # NOTE(review): title, section_id, description, text, publication,
        # draft, sector_id, update, lang, set_lang, translation_key and extra
        # are also assigned by Item.__init__ from the same columns; the
        # re-assignments below run the converters a second time. Confirm
        # whether they can be dropped.
        self.id: int = article.id_article
        self.surtitle: str = article.surtitre  # Probably unused
        self.title: str = convert_meta(article.titre)
        self.subtitle: str = article.soustitre  # Probably unused
        self.section_id: int = article.id_rubrique
        self.description: str = convert_meta(article.descriptif)
        self.caption: str = article.chapo  # Probably unused
        self.text: str = convert_body(article.texte)  # Markdown
        self.ps: str = article.ps  # Probably unused
        self.publication: str = article.date
        # Anything whose status is not "publie" is treated as a draft
        self.draft: bool = False if article.statut == "publie" else True
        self.sector_id: int = article.id_secteur
        self.update: str = article.maj
        self.update_2: str = article.date_modif  # Probably unused duplicate of maj
        self.creation: str = article.date_redac
        self.forum: bool = article.accepter_forum  # TODO Why ?
        self.lang: str = article.lang
        self.set_lang: bool = article.langue_choisie  # TODO Why ?
        self.translation_key: int = article.id_trad
        self.extra: str = article.extra  # Probably unused
        self.sitename: str = article.nom_site  # Probably useless
        self.virtual: str = article.virtuel  # TODO Why ?
        self.microblog: str = article.microblog  # Probably unused
@ -41,19 +104,6 @@ class Article:
        # self.popularity: float = article.popularite  # USELESS in static
        # self.version = article.id_version  # USELESS
    def get_section(self) -> str:
        """Return the converted title of the section this article belongs to.

        Looks up the `SpipRubriques` row whose `id_rubrique` equals
        `self.section_id` and passes its `titre` through `convert_meta`.
        Indexing the first result raises if no row matches.
        """
        matching_sections = SpipRubriques.select().where(
            SpipRubriques.id_rubrique == self.section_id
        )
        first_section = matching_sections[0]
        return convert_meta(first_section.titre)
    def get_path(self) -> str:
        """Return the relative directory "<section-slug>/<title-slug>/"."""
        section_slug = slugify(self.get_section())
        title_slug = slugify(f"{self.title}")
        return section_slug + "/" + title_slug + "/"
    def get_filename(self) -> str:
        """Return the Markdown file name for this article: "index.<lang>.md"."""
        return "".join(("index.", self.lang, ".md"))
    def get_authors(self) -> tuple:
        return (
            SpipAuteurs.select()
@ -87,112 +137,79 @@ class Article:
            allow_unicode=True,
        )
    def get_article(self) -> str:
        """Assemble the full article file: frontmatter followed by the body.

        The result is a sequence of chunks separated by blank lines: the
        frontmatter fenced by "---", then (each only when non-empty) the
        caption followed by a horizontal rule, the title as a level-1 heading,
        the text, and the EXTRA / POST-SCRIPTUM / MICROBLOGGING sections.
        """
        # Every chunk is joined to the previous one by a blank line
        chunks: list[str] = ["---\n" + self.get_frontmatter() + "---"]
        # Caption, closed by a horizontal rule
        if len(self.caption) > 0:
            chunks.append(self.caption + "\n\n***")
        # Title as a Markdown h1
        if len(self.title) > 0:
            chunks.append("# " + self.title)
        # Main text
        if len(self.text) > 0:
            chunks.append(self.text)
        # "Extra" section (may be None)
        if self.extra is not None and len(self.extra) > 0:
            chunks.append("# EXTRA\n\n" + self.extra)
        # Post-scriptum
        if len(self.ps) > 0:
            chunks.append("# POST-SCRIPTUM\n\n" + self.ps)
        # Microblog
        if len(self.microblog) > 0:
            chunks.append("# MICROBLOGGING\n\n" + self.microblog)
        return "\n\n".join(chunks)
-    def get_unknown_chars(self) -> list[str]:
+class Section(Item):
        errors: list[str] = []
        for text in (self.title, self.text):
            for char in unknown_iso:
                for match in finditer(char + r".*(?=\r?\n|$)", text):
                    errors.append(match.group())
        return errors
 class Section:
    def __init__(self, section) -> None:
        """Populate the section from a SPIP section (rubrique) database row.

        The parent initialiser runs first, then the columns of `section` are
        copied onto the instance; title and description go through
        `convert_meta` and the body through `convert_body`.
        """
        super().__init__(section)
        # Identifiers & position in the hierarchy
        self.id: int = section.id_rubrique
        self.parent_id: int = section.id_parent
        self.sector_id: int = section.id_secteur
        self.translation_key: int = section.id_trad
        self.depth: int = section.profondeur
        self.agenda: int = section.agenda
        # Textual content
        self.title: str = convert_meta(section.titre)
        self.description: str = convert_meta(section.descriptif)
        self.text: str = convert_body(section.texte)  # Markdown
        self.extra: str = section.extra  # Probably unused
        # Dates
        self.update: str = section.maj
        self.publication: str = section.date
        # Status & language flags
        self.draft: bool = not (section.statut == "publie")
        self.lang: str = section.lang
        self.lang_set: bool = not (section.langue_choisie == "oui")
-    def get_articles(self, limit: int):
+    def get_articles(self, limit: int = 0):
-        return Articles(limit)
+        return Articles(self.id, limit)
-class Articles:
+class LimitCounter:
-    exported: int = 0
+    count: int
    LIMIT: int
    def __init__(self, limit: int) -> None:
-        # Query the DB to retrieve all articles sorted by publication date
+        self.count = -1
-        self.articles = (
+        self.LIMIT = limit
            SpipArticles.select().order_by(SpipArticles.date.desc()).limit(limit)
        )
        self.toExport: int = len(self.articles)
-    def remaining(self):
+    def remaining(self) -> int:
-        return self.toExport - self.exported
+        return self.LIMIT - self.count
    def step(self) -> int:
        """Advance the counter by one and return the new count.

        Raises:
            StopIteration: when, after incrementing, `remaining()` is zero or
                negative.
        """
        self.count += 1
        exhausted = self.remaining() <= 0
        if exhausted:
            raise StopIteration
        return self.count
 class Items:
    items: list
    def __init__(self) -> None:
        """Create the iteration counter for the already-assigned `self.items`."""
        # The counter is capped at the number of retrieved items
        total = len(self.items)
        self.count = LimitCounter(total)
    def __iter__(self):
        """Return the instance itself as the iterator."""
        return self
-    def __next__(self):
+    def __len__(self) -> int:
-        if self.remaining() <= 0:
+        return self.count.LIMIT
            raise StopIteration
        self.exported += 1
        article = Article(self.articles[self.exported - 1])
        return (
            {"exported": self.exported, "remaining": self.remaining()},
            article,
        )
-class Sections:
+class Articles(Items):
-    exported: int = 0
+    def __init__(self, section_id: int, limit: int = 0) -> None:
    def __init__(self, limit: int = 0) -> None:
        # Query the DB to retrieve all articles sorted by publication date
        if limit > 0:
-            self.articles = (
+            self.items = (
-                SpipArticles.select().order_by(SpipArticles.date.desc()).limit(limit)
+                SpipArticles.select()
                .where(SpipArticles.id_rubrique == section_id)
                .order_by(SpipArticles.date.desc())
                .limit(limit)
            )
        else:
-            self.articles = SpipArticles.select().order_by(SpipArticles.date.desc())
+            self.items = SpipArticles.select().order_by(SpipArticles.date.desc())
-        self.toExport: int = len(self.articles)
+        super().__init__()
    def remaining(self):
        return self.toExport - self.exported
    def __iter__(self):
        return self
    def __next__(self):
-        if self.remaining() <= 0:
+        return (Article(self.items[self.count.step()]), self.count)
-            raise StopIteration
+
-        self.exported += 1
+
-        section = Section(self.articles[self.exported - 1])
+class Sections(Items):
-        return (
+    def __init__(self, limit: int = 0) -> None:
-            {"exported": self.exported, "remaining": self.remaining()},
+        # Query the DB to retrieve all sections sorted by publication date
-            section,
+        if limit > 0:
-        )
+            self.items = (
                SpipRubriques.select().order_by(SpipRubriques.date.desc()).limit(limit)
            )
        else:
            self.items = SpipRubriques.select().order_by(SpipRubriques.date.desc())
        super().__init__()
    def __next__(self):
        """Return the next `(Section, counter)` pair.

        The index comes from `self.count.step()`; any exception raised by
        `step()` propagates to the caller.
        """
        index = self.count.step()
        return Section(self.items[index]), self.count
--- a/spip2md/main.py
+++ b/spip2md/main.py
@ -1,8 +1,8 @@
 #!python
 # pyright: strict
 import sys
 from os import makedirs, mkdir
 from shutil import rmtree
 from sys import argv
 from config import config
 from converter import highlight_unknown_chars
@ -20,14 +20,14 @@ RESET: str = "\033[0m"
 db.init(config.db, host=config.db_host, user=config.db_user, password=config.db_pass)
 db.connect()
-if __name__ == "__main__":
+if __name__ == "__main__":  # Following is executed only if script is directly executed
-    # Define max nb of articles to export based on first CLI param
+    # Define max nb of articles to export based on first CLI argument
-    if len(sys.argv) > 1:
+    if len(argv) > 1:
-        maxexport = int(sys.argv[1])
+        maxexport = int(argv[1])
    else:
        maxexport = config.default_export_nb
-    # Clean the output dir & create a new
+    # Clear the output dir & create a new
    rmtree(config.output_dir, True)
    mkdir(config.output_dir)
@ -35,28 +35,64 @@ if __name__ == "__main__":
    unknown_chars_articles: list[Article] = []
    # Loop among first maxexport articles & export them
-    for counter, section in Sections():
+    for section, counter in Sections():
-        for counter, article in section.get_articles(maxexport):
+        # Print the name of the exported section & number of remaining sections
-            if (counter["exported"] - 1) % 100 == 0:
+        print(
-                print(
+            f"{BOLD}{counter.count}. {RESET}"
-                    f"\n{BOLD}Exporting {R}{counter['remaining']+1}{RESET}"
+            + highlight_unknown_chars(section.title, R, RESET),
-                    + f"{BOLD} SPIP articles to Markdown & YAML files{RESET}\n"
+            end="",
-                )
+        )
-            empty: str = "EMPTY " if len(article.text) < 1 else ""
+        if counter.remaining() > 2:
            print(
-                f"{BOLD}{counter['exported']}. {empty}{RESET}"
+                f"   {BOLD}{R}{counter.remaining()-1}{RESET} {BOLD}sections left"
                + RESET,
            )
        else:
            print()
        # Define the section’s path (directory) & create directory(ies) if needed
        sectiondir: str = config.output_dir + "/" + section.get_slug()
        makedirs(sectiondir, exist_ok=True)
        # Define the section filename & write the index at that filename
        sectionpath: str = sectiondir + "/" + section.get_filename()
        with open(sectionpath, "w") as f:
            f.write(section.get_content())
        # Loop over section’s articles
        articles = section.get_articles(maxexport)
        maxexport -= len(articles)
        for article, counter in articles:
            # Print the remaining number of articles to export every 100 articles
            if counter.count % 100 == 0:
                s: str = "s" if counter.remaining() > 1 else ""
                print(
                    f"  {BOLD}Exporting {R}{counter.remaining()}{RESET}"
                    + f"{BOLD} SPIP article{s}{RESET} to Markdown & YAML files\n"
                )
            # Print the title of the article being exported
            print(
                f"  {BOLD}{counter.count + 1}. "
                + ("EMPTY " if len(article.text) < 1 else "")
                + RESET
                + highlight_unknown_chars(article.title, R, RESET)
            )
-            fullpath: str = config.output_dir + "/" + article.get_path()
+            # Define the full article path & create directory(ies) if needed
-            print(f"{BOLD}>{RESET} {fullpath}{article.get_filename()}")
+            articledir: str = sectiondir + "/" + article.get_slug()
-            makedirs(fullpath, exist_ok=True)
+            makedirs(articledir, exist_ok=True)
-            with open(fullpath + article.get_filename(), "w") as f:
+            # Define the article filename & write the article at the filename
-                f.write(article.get_article())
+            articlepath: str = articledir + "/" + article.get_filename()
            with open(articlepath, "w") as f:
                f.write(article.get_content())
            # Store detected unknown characters
            if len(article.get_unknown_chars()) > 0:
                unknown_chars_articles.append(article)
            # Print the outputted file’s path when finished exporting the article
            print(f"  {BOLD}Article>{RESET} {articlepath}")
        # Print the outputted file’s path when finished exporting the section
        print(f"\n{BOLD}Section>{RESET} {sectionpath}\n")
    # Loop through each article that contains an unknown character
    for article in unknown_chars_articles:
        # Print the title of the article in which there are unknown characters
        # & the number of them
        unknown_chars_apparitions: list[str] = article.get_unknown_chars()
        nb: int = len(unknown_chars_apparitions)
        s: str = "s" if nb > 1 else ""
@ -64,7 +100,8 @@ if __name__ == "__main__":
            f"\n{BOLD}{nb}{RESET} unknown character{s} in {BOLD}{article.lang}{RESET} "
            + highlight_unknown_chars(article.title, R, RESET)
        )
        # Print the context in which the unknown characters are found
        for text in unknown_chars_apparitions:
            print(f"  {BOLD}…{RESET} " + highlight_unknown_chars(text, R, RESET))
-    db.close()  # Close the database connection
+    db.close()  # Close the connection with the database