refactoring

2023-05-22 09:38:18 +02:00 · 2023-05-22 09:38:18 +02:00 · a83ec1da3c
commit a83ec1da3c
parent a0fd7f0efa
4 changed files with 78 additions and 60 deletions
--- a/spip2md/config.py
+++ b/spip2md/config.py
@ -14,7 +14,7 @@ class Configuration:
    db_user = "spip"
    db_pass = "password"
    output_dir = "output"
-    default_export_nb = 1000
+    default_export_max = 1000

    def __init__(self, config_file: Optional[str] = None) -> None:
        if config_file is not None:
@ -29,7 +29,7 @@ class Configuration:
            if "output_dir" in config:
                self.output_dir = config["output_dir"]
            if "default_export_nb" in config:
-                self.default_export_nb = config["default_export_nb"]
+                self.default_export_max = config["default_export_max"]


 config = Configuration()
--- a/spip2md/converter.py
+++ b/spip2md/converter.py
@ -288,3 +288,11 @@ def highlight_unknown_chars(text: str, pre: str, post: str) -> str:
                text[: match.start()] + pre + match.group() + post + text[match.end() :]
            )
    return text
+
+
+def get_unknown_chars(text: str) -> list[str]:
+    """Return the excerpts of *text* surrounding each unknown character.
+
+    Every entry of ``unknown_iso`` is searched for in *text*. Each match
+    yields one excerpt made of up to 20 characters preceding the entry, the
+    entry itself, and the remainder of its line.
+
+    NOTE(review): entries are inserted into the pattern as-is, so they are
+    presumably plain character sequences without regex metacharacters -
+    confirm against the definition of ``unknown_iso``.
+    """
+    errors: list[str] = []
+    for char in unknown_iso:
+        # ``{0,20}`` is the bounded-repetition quantifier. ``{0-20}`` is not
+        # valid quantifier syntax: ``re`` would read it as literal text and
+        # only match input that actually contains the string "{0-20}".
+        pattern: str = r".{0,20}" + char + r".*(?=\r?\n|$)"
+        errors.extend(match.group() for match in finditer(pattern, text))
+    return errors
--- a/spip2md/items.py
+++ b/spip2md/items.py
@ -1,10 +1,10 @@
 # pyright: basic
-from re import finditer
+from typing import Any, Optional

 from slugify import slugify
 from yaml import dump

-from converter import convert_body, convert_meta, unknown_iso
+from converter import convert_body, convert_meta
 from database import SpipArticles, SpipAuteurs, SpipAuteursLiens, SpipRubriques

 # from yaml import CDumper as Dumper
@ -35,7 +35,7 @@ class Item:
    def get_filename(self) -> str:
        return "index" + "." + self.lang + "." + FILETYPE

-    def get_frontmatter(self) -> str:
+    def get_frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
        return dump(
            {
                "lang": self.lang,
@ -48,40 +48,29 @@ class Item:
                # Debugging
                "spip_id": self.id,
                "spip_id_secteur": self.sector_id,
-            },
+            }
+            | append
+            if append is not None
+            else {},
            allow_unicode=True,
        )

-    def get_content(self) -> str:
-        # Build the final article text
-        article: str = "---\n" + self.get_frontmatter() + "---"
-        # If there is a caption, add the caption followed by a hr
-        if hasattr(self, "caption") and len(self.caption) > 0:
-            article += "\n\n" + self.caption + "\n\n***"
+    def get_body(self) -> str:
+        body: str = ""
        # Add the title as a Markdown h1
        if len(self.title) > 0:
-            article += "\n\n# " + self.title
+            body += "\n\n# " + self.title
        # If there is a text, add the text preceded by two line breaks
        if len(self.text) > 0:
-            article += "\n\n" + self.text
+            body += "\n\n" + self.text
        # Same with an "extra" section
        if self.extra is not None and len(self.extra) > 0:
-            article += "\n\n# EXTRA\n\n" + self.extra
-        # PS
-        if hasattr(self, "ps") and len(self.ps) > 0:
-            article += "\n\n# POST-SCRIPTUM\n\n" + self.ps
-        # Microblog
-        if hasattr(self, "microblog") and len(self.microblog) > 0:
-            article += "\n\n# MICROBLOGGING\n\n" + self.microblog
-        return article
+            body += "\n\n# EXTRA\n\n" + self.extra
+        return body

-    def get_unknown_chars(self) -> list[str]:
-        errors: list[str] = []
-        for text in (self.title, self.text):
-            for char in unknown_iso:
-                for match in finditer(r".{0-20}" + char + r".*(?=\r?\n|$)", text):
-                    errors.append(match.group())
-        return errors
+    def get_content(self) -> str:
+        """Return the full file text: delimited front matter, then the body."""
+        # The front matter is wrapped between two "---" delimiters and the
+        # body is appended directly after the closing one
+        parts: tuple[str, ...] = (
+            "---\n",
+            self.get_frontmatter(),
+            "---",
+            self.get_body(),
+        )
+        return "".join(parts)


 class Article(Item):
@ -115,28 +104,32 @@ class Article(Item):
        )

    def get_frontmatter(self) -> str:
-        return dump(
+        return super().get_frontmatter(
            {
-                "lang": self.lang,
-                "translationKey": self.translation_key,
-                "title": self.title,
                "surtitle": self.surtitle,
                "subtitle": self.subtitle,
                "date": self.creation,
-                "publishDate": self.publication,
-                "lastmod": self.update,
-                "draft": self.draft,
-                "description": self.description,
                "authors": [author.nom for author in self.get_authors()],
                # Debugging
-                "spip_id_article": self.id,
                "spip_id_rubrique": self.section_id,
                "spip_id_secteur": self.sector_id,
                "spip_chapo": self.caption,
            },
-            allow_unicode=True,
        )

+    def get_body(self) -> str:
+        """Return the Markdown body of the article.
+
+        The sections are emitted in this order, each one only when present
+        and non-empty: caption followed by a horizontal rule, the parent
+        class body, post-scriptum, microblog.
+        """
+        body: str = ""
+        # If there is a caption, add the caption followed by a hr. It is
+        # emitted before the parent body so that the rule separates the
+        # caption from what follows, as it did before get_content was split.
+        if hasattr(self, "caption") and len(self.caption) > 0:
+            body += "\n\n" + self.caption + "\n\n***"
+        # Sections shared with every item come from the parent class
+        body += super().get_body()
+        # PS
+        if hasattr(self, "ps") and len(self.ps) > 0:
+            body += "\n\n# POST-SCRIPTUM\n\n" + self.ps
+        # Microblog
+        if hasattr(self, "microblog") and len(self.microblog) > 0:
+            body += "\n\n# MICROBLOGGING\n\n" + self.microblog
+        return body
+

 class Section(Item):
    def __init__(self, section) -> None:
@ -146,6 +139,9 @@ class Section(Item):
        self.depth: int = section.profondeur
        self.agenda: int = section.agenda

+    def get_filename(self) -> str:
+        """Return the parent class filename prefixed with an underscore."""
+        base_name: str = super().get_filename()
+        return "_" + base_name
+
    def get_articles(self, limit: int = 0):
        return Articles(self.id, limit)

@ -168,7 +164,7 @@ class LimitCounter:
        return self.count


-class Items:
+class Iterator:
    items: list

    def __init__(self) -> None:
@ -182,7 +178,7 @@ class Items:
        return self.count.LIMIT


-class Articles(Items):
+class Articles(Iterator):
    def __init__(self, section_id: int, limit: int = 0) -> None:
        # Query the DB to retrieve all articles sorted by publication date
        if limit > 0:
@ -193,14 +189,18 @@ class Articles(Items):
                .limit(limit)
            )
        else:
-            self.items = SpipArticles.select().order_by(SpipArticles.date.desc())
+            self.items = (
+                SpipArticles.select()
+                .where(SpipArticles.id_rubrique == section_id)
+                .order_by(SpipArticles.date.desc())
+            )
        super().__init__()

    def __next__(self):
        return (Article(self.items[self.count.step()]), self.count)


-class Sections(Items):
+class Sections(Iterator):
    def __init__(self, limit: int = 0) -> None:
        # Query the DB to retrieve all sections sorted by publication date
        if limit > 0:
--- a/spip2md/main.py
+++ b/spip2md/main.py
@ -5,9 +5,9 @@ from shutil import rmtree
 from sys import argv

 from config import config
-from converter import highlight_unknown_chars
+from converter import get_unknown_chars, highlight_unknown_chars
 from database import db
-from lib import Article, Sections
+from items import Article, Sections

 # Define terminal escape sequences to stylize output
 R: str = "\033[91m"
@ -22,10 +22,10 @@ db.connect()

 if __name__ == "__main__":  # Following is executed only if script is directly executed
    # Define max nb of articles to export based on first CLI argument
-    if len(argv) > 1:
-        maxexport = int(argv[1])
+    if len(argv) >= 2:
+        toexport = int(argv[1])
    else:
-        maxexport = config.default_export_nb
+        toexport = config.default_export_max

    # Clear the output dir & create a new
    rmtree(config.output_dir, True)
@ -36,6 +36,10 @@ if __name__ == "__main__":  # Following is executed only if script is directly e

    # Loop among first maxexport articles & export them
    for section, counter in Sections():
+        # Define articles of the sections, limited by toexport
+        if toexport <= 0:
+            break
+        articles = section.get_articles(toexport)
        # Print the name of the exported section & number of remaining sections
        print(
            f"{BOLD}{counter.count + 1}. {RESET}"
@ -44,11 +48,16 @@ if __name__ == "__main__":  # Following is executed only if script is directly e
        )
        if counter.remaining() > 2:
            print(
-                f"   {BOLD}{R}{counter.remaining()-1}{RESET} {BOLD}sections left"
+                f"   {BOLD}{B}{counter.remaining()-1}{RESET} {BOLD}sections left"
                + RESET,
+                end="",
            )
-        else:
-            print()
+        if toexport > 1:
+            print(
+                f"   {BOLD}Export limit is in {R}{toexport}{RESET} articles{RESET}",
+                end="",
+            )
+        print()
        # Define the section’s path (directory) & create directory(ies) if needed
        sectiondir: str = config.output_dir + "/" + section.get_slug()
        makedirs(sectiondir, exist_ok=True)
@ -57,21 +66,19 @@ if __name__ == "__main__":  # Following is executed only if script is directly e
        with open(sectionpath, "w") as f:
            f.write(section.get_content())
        # Loop over section’s articles
-        articles = section.get_articles(maxexport)
-        maxexport -= len(articles)
        for article, counter in articles:
            # Print the remaining number of articles to export every 100 articles
            if counter.count % 100 == 0:
                s: str = "s" if counter.remaining() > 1 else ""
                print(
-                    f"  {BOLD}Exporting {R}{counter.remaining()}{RESET}"
+                    f"  {BOLD}Exporting {G}{counter.remaining()}{RESET}"
                    + f"{BOLD} SPIP article{s}{RESET} to Markdown & YAML files"
                )
            # Print the title of the article being exported
            print(
                f"  {BOLD}{counter.count + 1}. "
                + ("EMPTY " if len(article.text) < 1 else "")
-                + RESET
+                + f"{article.lang} {RESET}"
                + highlight_unknown_chars(article.title, R, RESET)
            )
            # Define the full article path & create directory(ies) if needed
@ -81,19 +88,22 @@ if __name__ == "__main__":  # Following is executed only if script is directly e
            articlepath: str = articledir + "/" + article.get_filename()
            with open(articlepath, "w") as f:
                f.write(article.get_content())
-            # Store detected unknown characters
-            if len(article.get_unknown_chars()) > 0:
+            # Store articles with unknown characters
+            print(f"UNKNOWN CHARS {get_unknown_chars(article.text)}")
+            if len(get_unknown_chars(article.text)) > 0:
                unknown_chars_articles.append(article)
            # Print the outputted file’s path when finished exporting the article
-            print(f"  {BOLD}Article>{RESET} {articlepath}")
+            print(f"  {BOLD}{G}-->{RESET} {articlepath}")
        # Print the outputted file’s path when finished exporting the section
-        print(f"{BOLD}Section>{RESET} {sectionpath}\n")
+        print(f"{BOLD}{B}-->{RESET} {sectionpath}\n")
+        # Decrement export limit with length of exported section
+        toexport -= len(articles)

    # Loop through each article that contains an unknown character
    for article in unknown_chars_articles:
        # Print the title of the article in which there is unknown characters
        # & the number of them
-        unknown_chars_apparitions: list[str] = article.get_unknown_chars()
+        unknown_chars_apparitions: list[str] = get_unknown_chars(article.text)
        nb: int = len(unknown_chars_apparitions)
        s: str = "s" if nb > 1 else ""
        print(