big refactor, code reusage, sections files export, more compact output, simplifications

2023-05-23 17:45:34 +02:00 · 2023-05-23 17:45:34 +02:00 · 0ce795dc08
commit 0ce795dc08
parent bf6b8d4fe5
3 changed files with 104 additions and 112 deletions
--- a/spip2md/config.py
+++ b/spip2md/config.py
@ -2,7 +2,6 @@
 from os.path import isfile
 from typing import Optional

-# from yaml import CLoader as Loader
 from yaml import Loader, load

 config_paths = ("spip2md.yml", "spip2md.yaml")
@ -20,28 +19,17 @@ class Configuration:
    db_user: str = "spip"
    db_pass: str = "password"
    output_dir: str = "output"
-    default_export_max: int = 1000
+    max_articles_export: int = 1000
+    max_sections_export: int = 500
    data_dir: str = "data"
    clear_output: bool = False

-    def __init__(self, config_file: Optional[str] = None) -> None:
+    def __init__(self, config_file: Optional[str] = None):
        if config_file is not None:
            with open(config_file) as f:
                config = load(f.read(), Loader=Loader)
-            if "db" in config:
-                self.db = config["db"]
-            if "db_user" in config:
-                self.db_user = config["db_user"]
-            if "db_pass" in config:
-                self.db_pass = config["db_pass"]
-            if "output_dir" in config:
-                self.output_dir = config["output_dir"]
-            if "default_export_nb" in config:
-                self.default_export_max = config["default_export_max"]
-            if "data_dir" in config:
-                self.data_dir = config["data_dir"]
-            if "clear_output" in config:
-                self.clear_output = config["clear_output"]
+            for attr in config:
+                setattr(self, attr, config[attr])


 config = Configuration(config_file())
--- a/spip2md/items.py
+++ b/spip2md/items.py
@ -18,33 +18,29 @@ from database import (
 EXPORTTYPE: str = "md"


-class LimitCounter:
-    def __init__(self, limit: int) -> None:
-        self.count: int = -1
-        self.LIMIT: int = limit
-
-    def remaining(self) -> int:
-        return self.LIMIT - self.count
-
-    def step(self) -> int:
-        self.count += 1
-        if self.remaining() <= 0:
-            raise StopIteration
-        return self.count
-
-
 class Iterator:
    items: list[Any]

    def __init__(self) -> None:
-        # Set a counter caped at the number of retrieved items
-        self.count = LimitCounter(len(self.items))
+        # Set the limit at the number of retrieved items
+        self.LIMIT: int = len(self.items)
+        # Start before the first element
+        self.count: int = -1

    def __iter__(self):
        return self

    def __len__(self) -> int:
-        return self.count.LIMIT
+        return self.LIMIT
+
+    def remaining(self) -> int:
+        return self.LIMIT - self.count
+
+    def __next__(self) -> Any:
+        self.count += 1
+        if self.remaining() <= 0:
+            raise StopIteration
+        return self.items[self.count]


 class Document:
@ -72,7 +68,7 @@ class Document:
 class Documents(Iterator):
    def __init__(self, object_id: int) -> None:
        # Query the DB to retrieve all documents related to object of id object_id
-        self.items = (
+        items = (
            SpipDocuments.select()
            .join(
                SpipDocumentsLiens,
@ -80,16 +76,14 @@ class Documents(Iterator):
            )
            .where(SpipDocumentsLiens.id_objet == object_id)
        )
+        self.items: list[Document] = [Document(i) for i in items]
        super().__init__()

-    def __next__(self):
-        return (Document(self.items[self.count.step()]), self.count)
-

 class Item:
    id: int

-    def __init__(self, item: SpipArticles | SpipRubriques) -> None:
+    def __init__(self, item: SpipArticles | SpipRubriques):
        self.title: str = convert_meta(item.titre)
        self.section_id: int = item.id_rubrique
        self.description: str = convert_meta(item.descriptif)
@ -139,7 +133,7 @@ class Item:
            # Convert images & files links
            text: str = convert_documents(
                self.text,
-                [(d.id, d.title, d.get_slug()) for d, _ in self.get_documents()],
+                [(d.id, d.title, d.get_slug()) for d in self.get_documents()],
            )
            # Remove remaining HTML after & append to body
            body += "\n\n" + remove_tags(text)
@ -157,7 +151,7 @@ class Item:


 class Article(Item):
-    def __init__(self, article: SpipArticles) -> None:
+    def __init__(self, article: SpipArticles):
        super().__init__(article)
        self.id: int = article.id_article
        self.surtitle: str = convert_meta(article.surtitre)  # Probably unused
@ -218,7 +212,7 @@ class Article(Item):


 class Section(Item):
-    def __init__(self, section: SpipRubriques) -> None:
+    def __init__(self, section: SpipRubriques):
        super().__init__(section)
        self.id: int = section.id_rubrique
        self.parent_id: int = section.id_parent
@ -233,37 +227,33 @@ class Section(Item):


 class Articles(Iterator):
-    def __init__(self, section_id: int, limit: int = 0) -> None:
+    def __init__(self, section_id: int, limit: int = 0):
        # Query the DB to retrieve all articles sorted by publication date
        if limit > 0:
-            self.items = (
+            items = (
                SpipArticles.select()
                .where(SpipArticles.id_rubrique == section_id)
                .order_by(SpipArticles.date.desc())
                .limit(limit)
            )
        else:
-            self.items = (
+            items = (
                SpipArticles.select()
                .where(SpipArticles.id_rubrique == section_id)
                .order_by(SpipArticles.date.desc())
            )
+        self.items: list[Article] = [Article(i) for i in items]
        super().__init__()

-    def __next__(self):
-        return (Article(self.items[self.count.step()]), self.count)
-

 class Sections(Iterator):
-    def __init__(self, limit: int = 0) -> None:
+    def __init__(self, limit: int = 0):
        # Query the DB to retrieve all sections sorted by publication date
        if limit > 0:
-            self.items = (
+            items = (
                SpipRubriques.select().order_by(SpipRubriques.date.desc()).limit(limit)
            )
        else:
-            self.items = SpipRubriques.select().order_by(SpipRubriques.date.desc())
+            items = SpipRubriques.select().order_by(SpipRubriques.date.desc())
+        self.items: list[Section] = [Section(i) for i in items]
        super().__init__()
-
-    def __next__(self):
-        return (Section(self.items[self.count.step()]), self.count)
--- a/spip2md/main.py
+++ b/spip2md/main.py
@ -10,10 +10,7 @@ from converter import get_unknown_chars, unknown_chars
 from database import db
 from items import (
    Article,
-    Articles,
    Document,
-    Documents,
-    LimitCounter,
    Section,
    Sections,
 )
@ -55,27 +52,30 @@ def highlight(string: str, *start_stop: tuple[int, int]) -> None:
    print(string[previous_stop:], end="")


+# Plural ?
+def s(nb: int) -> str:
+    return "s" if nb > 1 else ""
+
+
+# Indent with spaces
+def indent(nb: int = 1) -> None:
+    for _ in range(nb):
+        print("  ", end="")
+
+
 # Connect to the MySQL database with Peewee ORM
 db.init(config.db, host=config.db_host, user=config.db_user, password=config.db_pass)
 db.connect()


 # Output information about ongoing export & write section to output destination
-def write_section(
-    section: Section, counter: LimitCounter
-) -> tuple[Articles, Documents, str]:
+def write_section(index: int, total: int, section: Section) -> str:
    # Print the name of the exported section & number of remaining sections
-    style(f"{counter.count + 1}. ", BO)
+    style(f"{index + 1}. ", BO)
    highlight(section.title, *unknown_chars(section.title))
-    if counter.remaining() > 2:
-        style(f"   {counter.remaining()-1}", BO, G)
-        style(" sections")
-        print(" left to export", end="")
-    if toexport > 1:
-        style(f"   {toexport}", BO, Y)
-        style(" articles")
-        print(" left before export limit", end="")
-    print()
+    style(f"   {total-index-1}", BO, G)
+    style(f" section{s(total-index)}")
+    print(" left to export")
    # Define the section’s path (directory) & create directory(ies) if needed
    sectiondir: str = config.output_dir + "/" + section.get_slug()
    makedirs(sectiondir, exist_ok=True)
@ -84,29 +84,26 @@ def write_section(
    with open(sectionpath, "w") as f:
        f.write(section.get_content())
    # Return the first "limit" articles of section
-    return (section.get_articles(), section.get_documents(), sectiondir)
+    return sectiondir


 # Output information about ongoing export & write article to output destination
-def write_article(
-    article: Article, counter: LimitCounter, sectiondir: str
-) -> tuple[Documents, str]:
+def write_article(index: int, total: int, article: Article, sectiondir: str) -> str:
    # Print the remaining number of articles to export every 100 articles
-    if counter.count % 100 == 0:
-        s: str = "s" if counter.remaining() > 1 else ""
-        print("  Exporting", end="")
-        style(f" {counter.remaining()}", BO, Y)
+    if index % 100 == 0:
+        indent()
+        print("Exporting", end="")
+        style(f" {total-index}", BO, Y)
        print(" SPIP", end="")
-        style(f" article{s}")
+        style(f" article{s(total-index)}")
        print(" to Markdown & YAML files")
    # Print the title of the article being exported
    style(
-        f"  {counter.count + 1}. "
+        f"  {index + 1}. "
        + ("EMPTY " if len(article.text) < 1 else "")
        + f"{article.lang} "
    )
    highlight(article.title, *unknown_chars(article.title))
-    print()
    # Define the full article path & create directory(ies) if needed
    articledir: str = sectiondir + "/" + article.get_slug()
    makedirs(articledir, exist_ok=True)
@ -114,43 +111,46 @@ def write_article(
    articlepath: str = articledir + "/" + article.get_filename()
    with open(articlepath, "w") as f:
        f.write(article.get_content())
-    # Store articles with unknown characters
-    if len(get_unknown_chars(article.text)) > 0:
-        unknown_chars_articles.append(article)
-    return (article.get_documents(), articledir)
+    # Print export location when finished exporting
+    style(" -> ", BO, G)
+    print(articlepath)
+    return articledir


 # Output information about ongoing export & copy document to output destination
-def write_document(document: Document, counter: LimitCounter, objectdir: str) -> None:
-    if counter.count % 100 == 0:
-        s: str = "s" if counter.remaining() > 1 else ""
-        print("    Exporting", end="")
-        style(f" {counter.remaining()}", BO, B)
-        style(f" document{s}")
-        print(" in this article")
+def write_document(
+    index: int, total: int, document: Document, objectdir: str, indent_depth: int = 1
+) -> None:
+    if index % 100 == 0:
+        indent(indent_depth)
+        print("Exporting", end="")
+        style(f" {total-index}", BO, B)
+        style(f" document{s(total-index)}\n")
    # Print the name of the file with a counter
-    style(f"    {counter.count + 1}. {document.media} ")
+    indent(indent_depth)
+    style(f"{index + 1}. {document.media} ")
    if len(document.title) > 0:
        highlight(document.title + " ", *unknown_chars(document.title))
    style("at ")
-    print(document.file)
+    print(document.file, end="")
    # Define document path
    documentpath: str = expanduser(config.data_dir + "/" + document.file)
    # Copy the document from it’s SPIP location to the new location
    try:
        copyfile(documentpath, objectdir + "/" + document.get_slug())
    except FileNotFoundError:
-        style("    NOT FOUND: ", BO, R)
-        print(documentpath)
+        style(" -> NOT FOUND!\n", BO, R)
    else:
        # Print the outputted file’s path when copied the file
-        style("    -->", BO, B)
+        style(" ->", BO, B)
        print(f" {objectdir}/{document.get_slug()}")


 # Return true if an article field contains an unknown character
 def has_unknown_chars(article: Article) -> bool:
+    if len(get_unknown_chars(article.text)) > 0:
        return True
+    return False


 # Print the detected unknown chars in article in their context but highlighted
@ -177,9 +177,14 @@ def warn_unknown_chars(article: Article) -> None:
 if __name__ == "__main__":
    # Define max nb of articles to export based on first CLI argument
    if len(argv) >= 2:
-        toexport = int(argv[1])
+        max_articles_export = int(argv[1])
    else:
-        toexport = config.default_export_max
+        max_articles_export = config.max_articles_export
+    # Define max nb of sections to export based on second CLI argument
+    if len(argv) >= 3:
+        max_sections_export = int(argv[2])
+    else:
+        max_sections_export = config.max_sections_export

    # Clear the output dir & create a new
    if config.clear_output:
@ -189,24 +194,33 @@ if __name__ == "__main__":
    # Make a list containing articles where unknown characters are detected
    unknown_chars_articles: list[Article] = []

-    # Loop among first maxexport articles & export them
-    for section, counter in Sections(toexport):
+    # Get sections with an eventual maximum
+    sections = Sections(max_sections_export)
+    nb_sections_export: int = len(sections)
+
+    # Loop among sections & export them
+    for i, section in enumerate(sections):
        # Write the section & store its articles
-        articles, documents, sectiondir = write_section(section, counter)
+        sectiondir = write_section(i, nb_sections_export, section)
        # Loop over section’s related files (images …)
-        for document, counter in documents:
-            write_document(document, counter, sectiondir)
+        documents = section.get_documents()
+        for i, document in enumerate(documents):
+            write_document(i, len(documents), document, sectiondir)
        # Loop over section’s articles
-        for article, counter in articles:
-            documents, articledir = write_article(article, counter, sectiondir)
+        articles = section.get_articles(max_articles_export)
+        for i, article in enumerate(articles):
+            articledir = write_article(i, len(articles), article, sectiondir)
            # Add article to unknown_chars_articles if needed
            if has_unknown_chars(article):
                unknown_chars_articles.append(article)
+            # Decrement export limit
+            max_articles_export -= 1
            # Loop over article’s related files (images …)
-            for document, counter in documents:
-                write_document(document, counter, articledir)
-        # Break 2 lines when finished exporting the section
-        print("\n")
+            documents = section.get_documents()
+            for i, document in enumerate(documents):
+                write_document(i, len(documents), document, sectiondir, 2)
+        # Break line when finished exporting the section
+        print()

    # Loop through each article that contains an unknown character
    for article in unknown_chars_articles: