iterate over sections, creating _index.md files. commenting. more context with unknown characters

This commit is contained in:
Guilhem Fauré 2023-05-17 14:29:57 +02:00
parent d15ad5fd8e
commit 6740035958
2 changed files with 187 additions and 133 deletions

View File

@ -9,29 +9,92 @@ from database import SpipArticles, SpipAuteurs, SpipAuteursLiens, SpipRubriques
# from yaml import CDumper as Dumper
FILETYPE: str = "md"
class Article:
def __init__(self, article):
class Item:
    """Fields and rendering shared by the SPIP objects exported to Markdown.

    Subclasses call ``super().__init__(row)`` and then set ``id`` themselves,
    because the primary-key column is named differently on each table.
    They may also set the optional ``caption``, ``ps`` and ``microblog``
    attributes, which ``get_content`` picks up when present.
    """

    # Primary key of the row; assigned by subclasses, read by get_frontmatter
    id: int

    def __init__(self, item) -> None:
        """Copy the columns common to every exported object from ``item``."""
        self.title: str = convert_meta(item.titre)
        self.section_id: int = item.id_rubrique
        self.description: str = convert_meta(item.descriptif)
        self.text: str = convert_body(item.texte)  # Markdown
        self.publication: str = item.date
        # An item is a draft unless its status is "publie"; the previous
        # comparison (== "publie") flagged every published item as a draft
        self.draft: bool = item.statut != "publie"
        self.sector_id: int = item.id_secteur
        self.update: str = item.maj
        self.lang: str = item.lang
        self.set_lang: bool = item.langue_choisie  # TODO Why?
        self.translation_key: int = item.id_trad
        self.extra: str = item.extra  # Probably unused

    def get_slug(self, date: bool = False) -> str:
        """Return the slugified title, prefixed by the publication date if ``date``."""
        prefix = self.publication if date else ""
        return slugify(f"{prefix}-{self.title}")

    def get_filename(self) -> str:
        """Return the output file name: ``index.<lang>.<FILETYPE>``."""
        return "index" + "." + self.lang + "." + FILETYPE

    def get_frontmatter(self) -> str:
        """Return the YAML front matter describing this item (without fences)."""
        return dump(
            {
                "lang": self.lang,
                "translationKey": self.translation_key,
                "title": self.title,
                "publishDate": self.publication,
                "lastmod": self.update,
                "draft": self.draft,
                "description": self.description,
                # Debugging
                "spip_id": self.id,
                "spip_id_secteur": self.sector_id,
            },
            allow_unicode=True,
        )

    def get_content(self) -> str:
        """Build the full file content: front matter followed by the body parts.

        Parts that are missing, ``None`` or empty are skipped; the remaining
        ones are separated by a blank line.
        """
        # Front matter enclosed in its "---" fences always comes first
        parts: list[str] = ["---\n" + self.get_frontmatter() + "---"]
        # Optional caption (only some subclasses define it), followed by a hr
        caption = getattr(self, "caption", None)
        if caption:
            parts.append(caption + "\n\n***")
        # Title as a Markdown h1
        if self.title:
            parts.append("# " + self.title)
        # Main text
        if self.text:
            parts.append(self.text)
        # "Extra" section
        if self.extra:
            parts.append("# EXTRA\n\n" + self.extra)
        # Optional post-scriptum
        ps = getattr(self, "ps", None)
        if ps:
            parts.append("# POST-SCRIPTUM\n\n" + ps)
        # Optional microblog
        microblog = getattr(self, "microblog", None)
        if microblog:
            parts.append("# MICROBLOGGING\n\n" + microblog)
        return "\n\n".join(parts)

    @staticmethod
    def _unknown_char_pattern(char: str) -> str:
        """Return a regex matching ``char`` with up to 20 characters of leading
        context and everything after it up to the end of the line.

        ``char`` is inserted as-is, so it is interpreted as a regex fragment.
        """
        # The quantifier must be written {0,20}: "{0-20}" is not repetition
        # syntax and is matched as literal text, so nothing was ever found
        return r".{0,20}" + char + r".*(?=\r?\n|$)"

    def get_unknown_chars(self) -> list[str]:
        """Return every excerpt of the title and text containing an unknown char."""
        errors: list[str] = []
        for text in (self.title, self.text):
            for char in unknown_iso:
                pattern = self._unknown_char_pattern(char)
                errors.extend(match.group() for match in finditer(pattern, text))
        return errors
class Article(Item):
def __init__(self, article) -> None:
super().__init__(article)
self.id: int = article.id_article
self.surtitle: str = article.surtitre # Probably unused
self.title: str = convert_meta(article.titre)
self.subtitle: str = article.soustitre # Probably unused
self.section_id: int = article.id_rubrique
self.description: str = convert_meta(article.descriptif)
self.caption: str = article.chapo # Probably unused
self.text: str = convert_body(article.texte) # Markdown
self.ps: str = article.ps # Probably unused
self.publication: str = article.date
self.draft: bool = False if article.statut == "publie" else True
self.sector_id: int = article.id_secteur
self.update: str = article.maj
self.update_2: str = article.date_modif # Probably unused duplicate of maj
self.creation: str = article.date_redac
self.forum: bool = article.accepter_forum # TODO Why?
self.lang: str = article.lang
self.set_lang: bool = article.langue_choisie # TODO Why?
self.translation_key: int = article.id_trad
self.extra: str = article.extra # Probably unused
self.sitename: str = article.nom_site # Probably useless
self.virtual: str = article.virtuel # TODO Why?
self.microblog: str = article.microblog # Probably unused
@ -41,19 +104,6 @@ class Article:
# self.popularity: float = article.popularite # USELESS in static
# self.version = article.id_version # USELESS
def get_section(self) -> str:
return convert_meta(
SpipRubriques.select()
.where(SpipRubriques.id_rubrique == self.section_id)[0]
.titre
)
def get_path(self) -> str:
return slugify(self.get_section()) + "/" + slugify(f"{self.title}") + "/"
def get_filename(self) -> str:
return "index." + self.lang + ".md"
def get_authors(self) -> tuple:
return (
SpipAuteurs.select()
@ -87,112 +137,79 @@ class Article:
allow_unicode=True,
)
def get_article(self) -> str:
# Build the final article text
article: str = "---\n" + self.get_frontmatter() + "---"
# If there is a caption, add the caption followed by a hr
if len(self.caption) > 0:
article += "\n\n" + self.caption + "\n\n***"
# Add the title as a Markdown h1
if len(self.title) > 0:
article += "\n\n# " + self.title
# If there is a text, add the text preceded by two line breaks
if len(self.text) > 0:
article += "\n\n" + self.text
# Same with an "extra" section
if self.extra is not None and len(self.extra) > 0:
article += "\n\n# EXTRA\n\n" + self.extra
# PS
if len(self.ps) > 0:
article += "\n\n# POST-SCRIPTUM\n\n" + self.ps
# Microblog
if len(self.microblog) > 0:
article += "\n\n# MICROBLOGGING\n\n" + self.microblog
return article
def get_unknown_chars(self) -> list[str]:
errors: list[str] = []
for text in (self.title, self.text):
for char in unknown_iso:
for match in finditer(char + r".*(?=\r?\n|$)", text):
errors.append(match.group())
return errors
class Section:
class Section(Item):
def __init__(self, section) -> None:
super().__init__(section)
self.id: int = section.id_rubrique
self.parent_id: int = section.id_parent
self.title: str = convert_meta(section.titre)
self.description: str = convert_meta(section.descriptif)
self.text: str = convert_body(section.texte) # Markdown
self.sector_id: int = section.id_secteur
self.update: str = section.maj
self.publication: str = section.date
self.draft: bool = False if section.statut == "publie" else True
self.lang: str = section.lang
self.lang_set: bool = False if section.langue_choisie == "oui" else True
self.extra: str = section.extra # Probably unused
self.translation_key: int = section.id_trad
self.depth: int = section.profondeur
self.agenda: int = section.agenda
def get_articles(self, limit: int):
return Articles(limit)
def get_articles(self, limit: int = 0):
return Articles(self.id, limit)
class Articles:
exported: int = 0
class LimitCounter:
    """Zero-based position counter that refuses to go past ``LIMIT`` steps."""

    count: int  # Index returned by the last step(); -1 before the first step
    LIMIT: int  # Number of successful steps allowed

    def __init__(self, limit: int) -> None:
        """Start before the first position, allowing ``limit`` steps."""
        self.count = -1
        self.LIMIT = limit

    def remaining(self) -> int:
        """Return ``LIMIT - count``.

        After a successful step this counts the current position too;
        before the first step it is ``LIMIT + 1``.
        """
        return self.LIMIT - self.count

    def step(self) -> int:
        """Advance by one and return the new index.

        Raises:
            StopIteration: when the limit has been reached.
        """
        # Once exhausted, stay put: incrementing again on every call would push
        # count past LIMIT and make remaining() negative
        if self.count >= self.LIMIT:
            raise StopIteration
        self.count += 1
        if self.remaining() <= 0:
            raise StopIteration
        return self.count
class Items:
    """Base iterator over a collection stored in ``self.items``.

    Subclasses must assign ``self.items`` before calling ``super().__init__()``
    and provide ``__next__``.
    """

    items: list

    def __init__(self) -> None:
        # The counter is capped at the number of retrieved items
        total = len(self.items)
        self.count = LimitCounter(total)

    def __iter__(self):
        # The object is its own iterator
        return self

    def __len__(self) -> int:
        # Number of items, as recorded when the counter was created
        return self.count.LIMIT
class Sections:
exported: int = 0
def __init__(self, limit: int = 0) -> None:
class Articles(Items):
    """Iterator over the articles of one section, newest publication first."""

    def __init__(self, section_id: int, limit: int = 0) -> None:
        """Query the articles whose ``id_rubrique`` is ``section_id``.

        Args:
            section_id: identifier of the section the articles belong to.
            limit: maximum number of articles; 0 or less means no cap.
        """
        # The section filter applies whether or not a limit is given: the
        # unlimited branch used to drop it and return every article in the DB
        query = (
            SpipArticles.select()
            .where(SpipArticles.id_rubrique == section_id)
            .order_by(SpipArticles.date.desc())
        )
        self.items = query.limit(limit) if limit > 0 else query
        super().__init__()

    def __next__(self):
        """Return ``(Article, counter)`` for the next row.

        ``StopIteration`` comes from the counter's ``step()`` when exhausted.
        """
        return (Article(self.items[self.count.step()]), self.count)
class Sections(Items):
    """Iterator over the sections, newest publication date first."""

    def __init__(self, limit: int = 0) -> None:
        # All sections sorted by publication date, most recent first
        newest_first = SpipRubriques.select().order_by(SpipRubriques.date.desc())
        # Only a strictly positive limit caps the query
        self.items = newest_first.limit(limit) if limit > 0 else newest_first
        super().__init__()

    def __next__(self):
        # Advance the counter first, then wrap the row found at that index
        index = self.count.step()
        return (Section(self.items[index]), self.count)

View File

@ -1,8 +1,8 @@
#!python
# pyright: strict
import sys
from os import makedirs, mkdir
from shutil import rmtree
from sys import argv
from config import config
from converter import highlight_unknown_chars
@ -20,14 +20,14 @@ RESET: str = "\033[0m"
db.init(config.db, host=config.db_host, user=config.db_user, password=config.db_pass)
db.connect()
if __name__ == "__main__":
# Define max nb of articles to export based on first CLI param
if len(sys.argv) > 1:
maxexport = int(sys.argv[1])
if __name__ == "__main__": # Following is executed only if script is directly executed
# Define max nb of articles to export based on first CLI argument
if len(argv) > 1:
maxexport = int(argv[1])
else:
maxexport = config.default_export_nb
# Clean the output dir & create a new
# Clear the output dir & create a new
rmtree(config.output_dir, True)
mkdir(config.output_dir)
@ -35,28 +35,64 @@ if __name__ == "__main__":
unknown_chars_articles: list[Article] = []
# Loop among first maxexport articles & export them
for counter, section in Sections():
for counter, article in section.get_articles(maxexport):
if (counter["exported"] - 1) % 100 == 0:
for section, counter in Sections():
# Print the name of the exported section & number of remaining sections
print(
f"\n{BOLD}Exporting {R}{counter['remaining']+1}{RESET}"
+ f"{BOLD} SPIP articles to Markdown & YAML files{RESET}\n"
f"{BOLD}{counter.count}. {RESET}"
+ highlight_unknown_chars(section.title, R, RESET),
end="",
)
empty: str = "EMPTY " if len(article.text) < 1 else ""
if counter.remaining() > 2:
print(
f"{BOLD}{counter['exported']}. {empty}{RESET}"
f" {BOLD}{R}{counter.remaining()-1}{RESET} {BOLD}sections left"
+ RESET,
)
else:
print()
# Define the sections path (directory) & create directory(ies) if needed
sectiondir: str = config.output_dir + "/" + section.get_slug()
makedirs(sectiondir, exist_ok=True)
# Define the section filename & write the index at that filename
sectionpath: str = sectiondir + "/" + section.get_filename()
with open(sectionpath, "w") as f:
f.write(section.get_content())
# Loop over sections articles
articles = section.get_articles(maxexport)
maxexport -= len(articles)
for article, counter in articles:
# Print the remaining number of articles to export every 100 articles
if counter.count % 100 == 0:
s: str = "s" if counter.remaining() > 1 else ""
print(
f" {BOLD}Exporting {R}{counter.remaining()}{RESET}"
+ f"{BOLD} SPIP article{s}{RESET} to Markdown & YAML files\n"
)
# Print the title of the article being exported
print(
f" {BOLD}{counter.count + 1}. "
+ ("EMPTY " if len(article.text) < 1 else "")
+ RESET
+ highlight_unknown_chars(article.title, R, RESET)
)
fullpath: str = config.output_dir + "/" + article.get_path()
print(f"{BOLD}>{RESET} {fullpath}{article.get_filename()}")
makedirs(fullpath, exist_ok=True)
with open(fullpath + article.get_filename(), "w") as f:
f.write(article.get_article())
# Define the full article path & create directory(ies) if needed
articledir: str = sectiondir + "/" + article.get_slug()
makedirs(articledir, exist_ok=True)
# Define the article filename & write the article at the filename
articlepath: str = articledir + "/" + article.get_filename()
with open(articlepath, "w") as f:
f.write(article.get_content())
# Store detected unknown characters
if len(article.get_unknown_chars()) > 0:
unknown_chars_articles.append(article)
# Print the outputted files path when finished exporting the article
print(f" {BOLD}Article>{RESET} {articlepath}")
# Print the outputted files path when finished exporting the section
print(f"\n{BOLD}Section>{RESET} {sectionpath}\n")
# Loop through each article that contains an unknown character
for article in unknown_chars_articles:
# Print the title of the article in which there is unknown characters
# & the number of them
unknown_chars_apparitions: list[str] = article.get_unknown_chars()
nb: int = len(unknown_chars_apparitions)
s: str = "s" if nb > 1 else ""
@ -64,7 +100,8 @@ if __name__ == "__main__":
f"\n{BOLD}{nb}{RESET} unknown character{s} in {BOLD}{article.lang}{RESET} "
+ highlight_unknown_chars(article.title, R, RESET)
)
# Print the context in which the unknown characters are found
for text in unknown_chars_apparitions:
print(f" {BOLD}{RESET} " + highlight_unknown_chars(text, R, RESET))
db.close() # Close the database connection
db.close() # Close the connection with the database