refactoring

This commit is contained in:
Guilhem Fauré 2023-05-22 09:38:18 +02:00
parent a0fd7f0efa
commit a83ec1da3c
4 changed files with 78 additions and 60 deletions

View File

@ -14,7 +14,7 @@ class Configuration:
db_user = "spip" db_user = "spip"
db_pass = "password" db_pass = "password"
output_dir = "output" output_dir = "output"
default_export_nb = 1000 default_export_max = 1000
def __init__(self, config_file: Optional[str] = None) -> None: def __init__(self, config_file: Optional[str] = None) -> None:
if config_file is not None: if config_file is not None:
@ -29,7 +29,7 @@ class Configuration:
if "output_dir" in config: if "output_dir" in config:
self.output_dir = config["output_dir"] self.output_dir = config["output_dir"]
if "default_export_nb" in config: if "default_export_nb" in config:
self.default_export_nb = config["default_export_nb"] self.default_export_max = config["default_export_max"]
config = Configuration() config = Configuration()

View File

@ -288,3 +288,11 @@ def highlight_unknown_chars(text: str, pre: str, post: str) -> str:
text[: match.start()] + pre + match.group() + post + text[match.end() :] text[: match.start()] + pre + match.group() + post + text[match.end() :]
) )
return text return text
def get_unknown_chars(text: str) -> list[str]:
    """Return the excerpts of *text* that contain an unknown character.

    For every entry of the module-level ``unknown_iso`` iterable, each
    occurrence in *text* is reported as one string made of up to 20
    characters preceding the entry, the entry itself, and the rest of the
    line it appears on.

    Entries of ``unknown_iso`` are concatenated into the pattern unescaped,
    so they are interpreted as regular-expression fragments.

    Returns an empty list when nothing matches.
    """
    errors: list[str] = []
    for char in unknown_iso:
        # `{0,20}` (comma) is the bounded quantifier; the former `{0-20}` was
        # parsed by `re` as the literal text "{0-20}", so nothing ever matched.
        # The lookahead stops the excerpt at the end of the current line.
        pattern = r".{0,20}" + char + r".*(?=\r?\n|$)"
        errors.extend(match.group() for match in finditer(pattern, text))
    return errors

View File

@ -1,10 +1,10 @@
# pyright: basic # pyright: basic
from re import finditer from typing import Any, Optional
from slugify import slugify from slugify import slugify
from yaml import dump from yaml import dump
from converter import convert_body, convert_meta, unknown_iso from converter import convert_body, convert_meta
from database import SpipArticles, SpipAuteurs, SpipAuteursLiens, SpipRubriques from database import SpipArticles, SpipAuteurs, SpipAuteursLiens, SpipRubriques
# from yaml import CDumper as Dumper # from yaml import CDumper as Dumper
@ -35,7 +35,7 @@ class Item:
def get_filename(self) -> str: def get_filename(self) -> str:
return "index" + "." + self.lang + "." + FILETYPE return "index" + "." + self.lang + "." + FILETYPE
def get_frontmatter(self) -> str: def get_frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
return dump( return dump(
{ {
"lang": self.lang, "lang": self.lang,
@ -48,40 +48,29 @@ class Item:
# Debugging # Debugging
"spip_id": self.id, "spip_id": self.id,
"spip_id_secteur": self.sector_id, "spip_id_secteur": self.sector_id,
}, }
| append
if append is not None
else {},
allow_unicode=True, allow_unicode=True,
) )
def get_content(self) -> str: def get_body(self) -> str:
# Build the final article text body: str = ""
article: str = "---\n" + self.get_frontmatter() + "---"
# If there is a caption, add the caption followed by a hr
if hasattr(self, "caption") and len(self.caption) > 0:
article += "\n\n" + self.caption + "\n\n***"
# Add the title as a Markdown h1 # Add the title as a Markdown h1
if len(self.title) > 0: if len(self.title) > 0:
article += "\n\n# " + self.title body += "\n\n# " + self.title
# If there is a text, add the text preceded by two line breaks # If there is a text, add the text preceded by two line breaks
if len(self.text) > 0: if len(self.text) > 0:
article += "\n\n" + self.text body += "\n\n" + self.text
# Same with an "extra" section # Same with an "extra" section
if self.extra is not None and len(self.extra) > 0: if self.extra is not None and len(self.extra) > 0:
article += "\n\n# EXTRA\n\n" + self.extra body += "\n\n# EXTRA\n\n" + self.extra
# PS return body
if hasattr(self, "ps") and len(self.ps) > 0:
article += "\n\n# POST-SCRIPTUM\n\n" + self.ps
# Microblog
if hasattr(self, "microblog") and len(self.microblog) > 0:
article += "\n\n# MICROBLOGGING\n\n" + self.microblog
return article
def get_unknown_chars(self) -> list[str]: def get_content(self) -> str:
errors: list[str] = [] # Return the final article text
for text in (self.title, self.text): return "---\n" + self.get_frontmatter() + "---" + self.get_body()
for char in unknown_iso:
for match in finditer(r".{0-20}" + char + r".*(?=\r?\n|$)", text):
errors.append(match.group())
return errors
class Article(Item): class Article(Item):
@ -115,28 +104,32 @@ class Article(Item):
) )
def get_frontmatter(self) -> str: def get_frontmatter(self) -> str:
return dump( return super().get_frontmatter(
{ {
"lang": self.lang,
"translationKey": self.translation_key,
"title": self.title,
"surtitle": self.surtitle, "surtitle": self.surtitle,
"subtitle": self.subtitle, "subtitle": self.subtitle,
"date": self.creation, "date": self.creation,
"publishDate": self.publication,
"lastmod": self.update,
"draft": self.draft,
"description": self.description,
"authors": [author.nom for author in self.get_authors()], "authors": [author.nom for author in self.get_authors()],
# Debugging # Debugging
"spip_id_article": self.id,
"spip_id_rubrique": self.section_id, "spip_id_rubrique": self.section_id,
"spip_id_secteur": self.sector_id, "spip_id_secteur": self.sector_id,
"spip_chapo": self.caption, "spip_chapo": self.caption,
}, },
allow_unicode=True,
) )
def get_body(self) -> str:
    """Return the Markdown body of the article.

    The body is assembled in this order:

    1. the caption followed by a ``***`` horizontal rule, if any;
    2. the body built by the parent class;
    3. a ``# POST-SCRIPTUM`` section, if ``ps`` is set;
    4. a ``# MICROBLOGGING`` section, if ``microblog`` is set.

    The caption goes first, as it did before the body was split out of
    ``get_content``: the rule after it separates it from what follows.
    Each optional attribute is skipped when missing, ``None`` or empty.
    """
    body: str = ""
    # If there is a caption, add the caption followed by a hr
    caption = getattr(self, "caption", None)
    if caption:
        body += "\n\n" + caption + "\n\n***"
    # Title, text and "extra" section come from the parent implementation
    body += super().get_body()
    # PS
    ps = getattr(self, "ps", None)
    if ps:
        body += "\n\n# POST-SCRIPTUM\n\n" + ps
    # Microblog
    microblog = getattr(self, "microblog", None)
    if microblog:
        body += "\n\n# MICROBLOGGING\n\n" + microblog
    return body
class Section(Item): class Section(Item):
def __init__(self, section) -> None: def __init__(self, section) -> None:
@ -146,6 +139,9 @@ class Section(Item):
self.depth: int = section.profondeur self.depth: int = section.profondeur
self.agenda: int = section.agenda self.agenda: int = section.agenda
def get_filename(self) -> str:
    """Return the parent class's filename prefixed with an underscore."""
    inherited_name: str = super().get_filename()
    return f"_{inherited_name}"
def get_articles(self, limit: int = 0): def get_articles(self, limit: int = 0):
return Articles(self.id, limit) return Articles(self.id, limit)
@ -168,7 +164,7 @@ class LimitCounter:
return self.count return self.count
class Items: class Iterator:
items: list items: list
def __init__(self) -> None: def __init__(self) -> None:
@ -182,7 +178,7 @@ class Items:
return self.count.LIMIT return self.count.LIMIT
class Articles(Items): class Articles(Iterator):
def __init__(self, section_id: int, limit: int = 0) -> None: def __init__(self, section_id: int, limit: int = 0) -> None:
# Query the DB to retrieve all articles sorted by publication date # Query the DB to retrieve all articles sorted by publication date
if limit > 0: if limit > 0:
@ -193,14 +189,18 @@ class Articles(Items):
.limit(limit) .limit(limit)
) )
else: else:
self.items = SpipArticles.select().order_by(SpipArticles.date.desc()) self.items = (
SpipArticles.select()
.where(SpipArticles.id_rubrique == section_id)
.order_by(SpipArticles.date.desc())
)
super().__init__() super().__init__()
def __next__(self): def __next__(self):
return (Article(self.items[self.count.step()]), self.count) return (Article(self.items[self.count.step()]), self.count)
class Sections(Items): class Sections(Iterator):
def __init__(self, limit: int = 0) -> None: def __init__(self, limit: int = 0) -> None:
# Query the DB to retrieve all sections sorted by publication date # Query the DB to retrieve all sections sorted by publication date
if limit > 0: if limit > 0:

View File

@ -5,9 +5,9 @@ from shutil import rmtree
from sys import argv from sys import argv
from config import config from config import config
from converter import highlight_unknown_chars from converter import get_unknown_chars, highlight_unknown_chars
from database import db from database import db
from lib import Article, Sections from items import Article, Sections
# Define terminal escape sequences to stylize output # Define terminal escape sequences to stylize output
R: str = "\033[91m" R: str = "\033[91m"
@ -22,10 +22,10 @@ db.connect()
if __name__ == "__main__": # Following is executed only if script is directly executed if __name__ == "__main__": # Following is executed only if script is directly executed
# Define max nb of articles to export based on first CLI argument # Define max nb of articles to export based on first CLI argument
if len(argv) > 1: if len(argv) >= 2:
maxexport = int(argv[1]) toexport = int(argv[1])
else: else:
maxexport = config.default_export_nb toexport = config.default_export_max
# Clear the output dir & create a new # Clear the output dir & create a new
rmtree(config.output_dir, True) rmtree(config.output_dir, True)
@ -36,6 +36,10 @@ if __name__ == "__main__": # Following is executed only if script is directly e
# Loop among first maxexport articles & export them # Loop among first maxexport articles & export them
for section, counter in Sections(): for section, counter in Sections():
# Define articles of the sections, limited by toexport
if toexport <= 0:
break
articles = section.get_articles(toexport)
# Print the name of the exported section & number of remaining sections # Print the name of the exported section & number of remaining sections
print( print(
f"{BOLD}{counter.count + 1}. {RESET}" f"{BOLD}{counter.count + 1}. {RESET}"
@ -44,10 +48,15 @@ if __name__ == "__main__": # Following is executed only if script is directly e
) )
if counter.remaining() > 2: if counter.remaining() > 2:
print( print(
f" {BOLD}{R}{counter.remaining()-1}{RESET} {BOLD}sections left" f" {BOLD}{B}{counter.remaining()-1}{RESET} {BOLD}sections left"
+ RESET, + RESET,
end="",
)
if toexport > 1:
print(
f" {BOLD}Export limit is in {R}{toexport}{RESET} articles{RESET}",
end="",
) )
else:
print() print()
# Define the sections path (directory) & create directory(ies) if needed # Define the sections path (directory) & create directory(ies) if needed
sectiondir: str = config.output_dir + "/" + section.get_slug() sectiondir: str = config.output_dir + "/" + section.get_slug()
@ -57,21 +66,19 @@ if __name__ == "__main__": # Following is executed only if script is directly e
with open(sectionpath, "w") as f: with open(sectionpath, "w") as f:
f.write(section.get_content()) f.write(section.get_content())
# Loop over sections articles # Loop over sections articles
articles = section.get_articles(maxexport)
maxexport -= len(articles)
for article, counter in articles: for article, counter in articles:
# Print the remaining number of articles to export every 100 articles # Print the remaining number of articles to export every 100 articles
if counter.count % 100 == 0: if counter.count % 100 == 0:
s: str = "s" if counter.remaining() > 1 else "" s: str = "s" if counter.remaining() > 1 else ""
print( print(
f" {BOLD}Exporting {R}{counter.remaining()}{RESET}" f" {BOLD}Exporting {G}{counter.remaining()}{RESET}"
+ f"{BOLD} SPIP article{s}{RESET} to Markdown & YAML files" + f"{BOLD} SPIP article{s}{RESET} to Markdown & YAML files"
) )
# Print the title of the article being exported # Print the title of the article being exported
print( print(
f" {BOLD}{counter.count + 1}. " f" {BOLD}{counter.count + 1}. "
+ ("EMPTY " if len(article.text) < 1 else "") + ("EMPTY " if len(article.text) < 1 else "")
+ RESET + f"{article.lang} {RESET}"
+ highlight_unknown_chars(article.title, R, RESET) + highlight_unknown_chars(article.title, R, RESET)
) )
# Define the full article path & create directory(ies) if needed # Define the full article path & create directory(ies) if needed
@ -81,19 +88,22 @@ if __name__ == "__main__": # Following is executed only if script is directly e
articlepath: str = articledir + "/" + article.get_filename() articlepath: str = articledir + "/" + article.get_filename()
with open(articlepath, "w") as f: with open(articlepath, "w") as f:
f.write(article.get_content()) f.write(article.get_content())
# Store detected unknown characters # Store articles with unknown characters
if len(article.get_unknown_chars()) > 0: print(f"UNKNOWN CHARS {get_unknown_chars(article.text)}")
if len(get_unknown_chars(article.text)) > 0:
unknown_chars_articles.append(article) unknown_chars_articles.append(article)
# Print the outputted files path when finished exporting the article # Print the outputted files path when finished exporting the article
print(f" {BOLD}Article>{RESET} {articlepath}") print(f" {BOLD}{G}-->{RESET} {articlepath}")
# Print the outputted files path when finished exporting the section # Print the outputted files path when finished exporting the section
print(f"{BOLD}Section>{RESET} {sectionpath}\n") print(f"{BOLD}{B}-->{RESET} {sectionpath}\n")
# Decrement export limit with length of exported section
toexport -= len(articles)
# Loop through each article that contains an unknown character # Loop through each article that contains an unknown character
for article in unknown_chars_articles: for article in unknown_chars_articles:
# Print the title of the article in which there is unknown characters # Print the title of the article in which there is unknown characters
# & the number of them # & the number of them
unknown_chars_apparitions: list[str] = article.get_unknown_chars() unknown_chars_apparitions: list[str] = get_unknown_chars(article.text)
nb: int = len(unknown_chars_apparitions) nb: int = len(unknown_chars_apparitions)
s: str = "s" if nb > 1 else "" s: str = "s" if nb > 1 else ""
print( print(