iterate over sections, creating _index.md files. commenting. more context with unknown characters

This commit is contained in:
Guilhem Fauré 2023-05-17 14:29:57 +02:00
parent d15ad5fd8e
commit 6740035958
2 changed files with 187 additions and 133 deletions

View File

@ -9,29 +9,92 @@ from database import SpipArticles, SpipAuteurs, SpipAuteursLiens, SpipRubriques
# from yaml import CDumper as Dumper # from yaml import CDumper as Dumper
FILETYPE: str = "md"
class Article:
def __init__(self, article): class Item:
id: int
def __init__(self, item) -> None:
    """Copy the fields shared by every exported item from a database row.

    ``item`` is a row object exposing the attributes read below
    (``titre``, ``id_rubrique``, ``descriptif``, ``texte``, ``date``,
    ``statut``, ``id_secteur``, ``maj``, ``lang``, ``langue_choisie``,
    ``id_trad`` and ``extra``).
    """
    self.title: str = convert_meta(item.titre)
    self.section_id: int = item.id_rubrique
    self.description: str = convert_meta(item.descriptif)
    self.text: str = convert_body(item.texte)  # Markdown
    self.publication: str = item.date
    # An item is a draft unless its status is "publie" (published).
    # Testing with == would flag every published item as a draft.
    self.draft: bool = item.statut != "publie"
    self.sector_id: int = item.id_secteur
    self.update: str = item.maj
    self.lang: str = item.lang
    self.set_lang: bool = item.langue_choisie  # TODO Why?
    self.translation_key: int = item.id_trad
    self.extra: str = item.extra  # Probably unused
def get_slug(self, date: bool = False) -> str:
    """Return the slugified title, prefixed by the publication date if asked.

    The "-" separator is always part of the string handed to ``slugify``,
    whether or not the date prefix is included.
    """
    prefix = self.publication if date else ""
    return slugify(f"{prefix}-{self.title}")
def get_filename(self) -> str:
    """Return the output file name: ``index.<lang>.<FILETYPE>``."""
    return ".".join(("index", self.lang, FILETYPE))
def get_frontmatter(self) -> str:
    """Serialize the item's metadata through ``dump`` and return the result."""
    metadata = {
        "lang": self.lang,
        "translationKey": self.translation_key,
        "title": self.title,
        "publishDate": self.publication,
        "lastmod": self.update,
        "draft": self.draft,
        "description": self.description,
        # Debugging
        "spip_id": self.id,
        "spip_id_secteur": self.sector_id,
    }
    return dump(metadata, allow_unicode=True)
def get_content(self) -> str:
    """Build the full file content: frontmatter followed by the Markdown body.

    The frontmatter returned by ``get_frontmatter`` is enclosed between
    ``---`` lines. The optional parts (caption, title, text, extra,
    post-scriptum, microblog) are appended in that order, separated by a
    blank line, and only when they are set and non-empty. ``None`` is
    treated like an empty value for every part, so a missing field never
    raises ``TypeError``.
    """
    parts: list[str] = ["---\n" + self.get_frontmatter() + "---"]
    # caption, ps & microblog are not set on every kind of item
    caption = getattr(self, "caption", None)
    if caption:
        # The caption is followed by a horizontal rule
        parts.append(caption + "\n\n***")
    # Add the title as a Markdown h1
    if self.title:
        parts.append("# " + self.title)
    if self.text:
        parts.append(self.text)
    # Same with an "extra" section
    if self.extra:
        parts.append("# EXTRA\n\n" + self.extra)
    # PS
    ps = getattr(self, "ps", None)
    if ps:
        parts.append("# POST-SCRIPTUM\n\n" + ps)
    # Microblog
    microblog = getattr(self, "microblog", None)
    if microblog:
        parts.append("# MICROBLOGGING\n\n" + microblog)
    # Every part is separated from the previous one by two line breaks
    return "\n\n".join(parts)
def get_unknown_chars(self) -> list[str]:
    """Return each occurrence of an unknown character with its context.

    The title and the text are searched for every entry of ``unknown_iso``.
    Each result holds up to 20 characters preceding the unknown character,
    the character itself and the remainder of its line.
    """
    errors: list[str] = []
    for text in (self.title, self.text):
        for char in unknown_iso:
            # "{0,20}" is the bounded quantifier; "{0-20}" is not a valid
            # quantifier and would be matched as literal text instead.
            pattern = r".{0,20}" + char + r".*(?=\r?\n|$)"
            errors.extend(match.group() for match in finditer(pattern, text))
    return errors
class Article(Item):
def __init__(self, article) -> None:
super().__init__(article)
self.id: int = article.id_article self.id: int = article.id_article
self.surtitle: str = article.surtitre # Probably unused self.surtitle: str = article.surtitre # Probably unused
self.title: str = convert_meta(article.titre)
self.subtitle: str = article.soustitre # Probably unused self.subtitle: str = article.soustitre # Probably unused
self.section_id: int = article.id_rubrique
self.description: str = convert_meta(article.descriptif)
self.caption: str = article.chapo # Probably unused self.caption: str = article.chapo # Probably unused
self.text: str = convert_body(article.texte) # Markdown
self.ps: str = article.ps # Probably unused self.ps: str = article.ps # Probably unused
self.publication: str = article.date
self.draft: bool = False if article.statut == "publie" else True
self.sector_id: int = article.id_secteur
self.update: str = article.maj
self.update_2: str = article.date_modif # Probably unused duplicate of maj self.update_2: str = article.date_modif # Probably unused duplicate of maj
self.creation: str = article.date_redac self.creation: str = article.date_redac
self.forum: bool = article.accepter_forum # TODO Why? self.forum: bool = article.accepter_forum # TODO Why?
self.lang: str = article.lang
self.set_lang: bool = article.langue_choisie # TODO Why?
self.translation_key: int = article.id_trad
self.extra: str = article.extra # Probably unused
self.sitename: str = article.nom_site # Probably useless self.sitename: str = article.nom_site # Probably useless
self.virtual: str = article.virtuel # TODO Why? self.virtual: str = article.virtuel # TODO Why?
self.microblog: str = article.microblog # Probably unused self.microblog: str = article.microblog # Probably unused
@ -41,19 +104,6 @@ class Article:
# self.popularity: float = article.popularite # USELESS in static # self.popularity: float = article.popularite # USELESS in static
# self.version = article.id_version # USELESS # self.version = article.id_version # USELESS
def get_section(self) -> str:
return convert_meta(
SpipRubriques.select()
.where(SpipRubriques.id_rubrique == self.section_id)[0]
.titre
)
def get_path(self) -> str:
return slugify(self.get_section()) + "/" + slugify(f"{self.title}") + "/"
def get_filename(self) -> str:
return "index." + self.lang + ".md"
def get_authors(self) -> tuple: def get_authors(self) -> tuple:
return ( return (
SpipAuteurs.select() SpipAuteurs.select()
@ -87,112 +137,79 @@ class Article:
allow_unicode=True, allow_unicode=True,
) )
def get_article(self) -> str:
# Build the final article text
article: str = "---\n" + self.get_frontmatter() + "---"
# If there is a caption, add the caption followed by a hr
if len(self.caption) > 0:
article += "\n\n" + self.caption + "\n\n***"
# Add the title as a Markdown h1
if len(self.title) > 0:
article += "\n\n# " + self.title
# If there is a text, add the text preceded by two line breaks
if len(self.text) > 0:
article += "\n\n" + self.text
# Same with an "extra" section
if self.extra is not None and len(self.extra) > 0:
article += "\n\n# EXTRA\n\n" + self.extra
# PS
if len(self.ps) > 0:
article += "\n\n# POST-SCRIPTUM\n\n" + self.ps
# Microblog
if len(self.microblog) > 0:
article += "\n\n# MICROBLOGGING\n\n" + self.microblog
return article
def get_unknown_chars(self) -> list[str]: class Section(Item):
errors: list[str] = []
for text in (self.title, self.text):
for char in unknown_iso:
for match in finditer(char + r".*(?=\r?\n|$)", text):
errors.append(match.group())
return errors
class Section:
def __init__(self, section) -> None: def __init__(self, section) -> None:
super().__init__(section)
self.id: int = section.id_rubrique self.id: int = section.id_rubrique
self.parent_id: int = section.id_parent self.parent_id: int = section.id_parent
self.title: str = convert_meta(section.titre)
self.description: str = convert_meta(section.descriptif)
self.text: str = convert_body(section.texte) # Markdown
self.sector_id: int = section.id_secteur
self.update: str = section.maj
self.publication: str = section.date
self.draft: bool = False if section.statut == "publie" else True
self.lang: str = section.lang
self.lang_set: bool = False if section.langue_choisie == "oui" else True
self.extra: str = section.extra # Probably unused
self.translation_key: int = section.id_trad
self.depth: int = section.profondeur self.depth: int = section.profondeur
self.agenda: int = section.agenda self.agenda: int = section.agenda
def get_articles(self, limit: int): def get_articles(self, limit: int = 0):
return Articles(limit) return Articles(self.id, limit)
class Articles: class LimitCounter:
exported: int = 0 count: int
LIMIT: int
def __init__(self, limit: int) -> None: def __init__(self, limit: int) -> None:
# Query the DB to retrieve all articles sorted by publication date self.count = -1
self.articles = ( self.LIMIT = limit
SpipArticles.select().order_by(SpipArticles.date.desc()).limit(limit)
)
self.toExport: int = len(self.articles)
def remaining(self): def remaining(self) -> int:
return self.toExport - self.exported return self.LIMIT - self.count
def step(self) -> int:
    """Advance the counter by one and return its new value.

    Raises ``StopIteration`` when, after the increment, nothing remains
    according to ``remaining()``.
    """
    self.count += 1
    if self.remaining() > 0:
        return self.count
    raise StopIteration
class Items:
items: list
def __init__(self) -> None:
    """Create the counter for ``self.items``, which must already be set."""
    # The counter is capped at the number of retrieved items
    retrieved: int = len(self.items)
    self.count = LimitCounter(retrieved)
def __iter__(self): def __iter__(self):
return self return self
def __next__(self): def __len__(self) -> int:
if self.remaining() <= 0: return self.count.LIMIT
raise StopIteration
self.exported += 1
article = Article(self.articles[self.exported - 1])
return (
{"exported": self.exported, "remaining": self.remaining()},
article,
)
class Sections: class Articles(Items):
exported: int = 0 def __init__(self, section_id: int, limit: int = 0) -> None:
def __init__(self, limit: int = 0) -> None:
# Query the DB to retrieve all articles sorted by publication date # Query the DB to retrieve all articles sorted by publication date
if limit > 0: if limit > 0:
self.articles = ( self.items = (
SpipArticles.select().order_by(SpipArticles.date.desc()).limit(limit) SpipArticles.select()
.where(SpipArticles.id_rubrique == section_id)
.order_by(SpipArticles.date.desc())
.limit(limit)
) )
else: else:
self.articles = SpipArticles.select().order_by(SpipArticles.date.desc()) self.items = SpipArticles.select().order_by(SpipArticles.date.desc())
self.toExport: int = len(self.articles) super().__init__()
def remaining(self):
return self.toExport - self.exported
def __iter__(self):
return self
def __next__(self): def __next__(self):
if self.remaining() <= 0: return (Article(self.items[self.count.step()]), self.count)
raise StopIteration
self.exported += 1
section = Section(self.articles[self.exported - 1]) class Sections(Items):
return ( def __init__(self, limit: int = 0) -> None:
{"exported": self.exported, "remaining": self.remaining()}, # Query the DB to retrieve all sections sorted by publication date
section, if limit > 0:
self.items = (
SpipRubriques.select().order_by(SpipRubriques.date.desc()).limit(limit)
) )
else:
self.items = SpipRubriques.select().order_by(SpipRubriques.date.desc())
super().__init__()
def __next__(self):
return (Section(self.items[self.count.step()]), self.count)

View File

@ -1,8 +1,8 @@
#!python #!python
# pyright: strict # pyright: strict
import sys
from os import makedirs, mkdir from os import makedirs, mkdir
from shutil import rmtree from shutil import rmtree
from sys import argv
from config import config from config import config
from converter import highlight_unknown_chars from converter import highlight_unknown_chars
@ -20,14 +20,14 @@ RESET: str = "\033[0m"
db.init(config.db, host=config.db_host, user=config.db_user, password=config.db_pass) db.init(config.db, host=config.db_host, user=config.db_user, password=config.db_pass)
db.connect() db.connect()
if __name__ == "__main__": if __name__ == "__main__": # Following is executed only if script is directly executed
# Define max nb of articles to export based on first CLI param # Define max nb of articles to export based on first CLI argument
if len(sys.argv) > 1: if len(argv) > 1:
maxexport = int(sys.argv[1]) maxexport = int(argv[1])
else: else:
maxexport = config.default_export_nb maxexport = config.default_export_nb
# Clean the output dir & create a new # Clear the output dir & create a new
rmtree(config.output_dir, True) rmtree(config.output_dir, True)
mkdir(config.output_dir) mkdir(config.output_dir)
@ -35,28 +35,64 @@ if __name__ == "__main__":
unknown_chars_articles: list[Article] = [] unknown_chars_articles: list[Article] = []
# Loop among first maxexport articles & export them # Loop among first maxexport articles & export them
for counter, section in Sections(): for section, counter in Sections():
for counter, article in section.get_articles(maxexport): # Print the name of the exported section & number of remaining sections
if (counter["exported"] - 1) % 100 == 0:
print( print(
f"\n{BOLD}Exporting {R}{counter['remaining']+1}{RESET}" f"{BOLD}{counter.count}. {RESET}"
+ f"{BOLD} SPIP articles to Markdown & YAML files{RESET}\n" + highlight_unknown_chars(section.title, R, RESET),
end="",
) )
empty: str = "EMPTY " if len(article.text) < 1 else "" if counter.remaining() > 2:
print( print(
f"{BOLD}{counter['exported']}. {empty}{RESET}" f" {BOLD}{R}{counter.remaining()-1}{RESET} {BOLD}sections left"
+ RESET,
)
else:
print()
# Define the sections path (directory) & create directory(ies) if needed
sectiondir: str = config.output_dir + "/" + section.get_slug()
makedirs(sectiondir, exist_ok=True)
# Define the section filename & write the index at that filename
sectionpath: str = sectiondir + "/" + section.get_filename()
with open(sectionpath, "w") as f:
f.write(section.get_content())
# Loop over sections articles
articles = section.get_articles(maxexport)
maxexport -= len(articles)
for article, counter in articles:
# Print the remaining number of articles to export every 100 articles
if counter.count % 100 == 0:
s: str = "s" if counter.remaining() > 1 else ""
print(
f" {BOLD}Exporting {R}{counter.remaining()}{RESET}"
+ f"{BOLD} SPIP article{s}{RESET} to Markdown & YAML files\n"
)
# Print the title of the article being exported
print(
f" {BOLD}{counter.count + 1}. "
+ ("EMPTY " if len(article.text) < 1 else "")
+ RESET
+ highlight_unknown_chars(article.title, R, RESET) + highlight_unknown_chars(article.title, R, RESET)
) )
fullpath: str = config.output_dir + "/" + article.get_path() # Define the full article path & create directory(ies) if needed
print(f"{BOLD}>{RESET} {fullpath}{article.get_filename()}") articledir: str = sectiondir + "/" + article.get_slug()
makedirs(fullpath, exist_ok=True) makedirs(articledir, exist_ok=True)
with open(fullpath + article.get_filename(), "w") as f: # Define the article filename & write the article at the filename
f.write(article.get_article()) articlepath: str = articledir + "/" + article.get_filename()
with open(articlepath, "w") as f:
f.write(article.get_content())
# Store detected unknown characters # Store detected unknown characters
if len(article.get_unknown_chars()) > 0: if len(article.get_unknown_chars()) > 0:
unknown_chars_articles.append(article) unknown_chars_articles.append(article)
# Print the outputted files path when finished exporting the article
print(f" {BOLD}Article>{RESET} {articlepath}")
# Print the outputted files path when finished exporting the section
print(f"\n{BOLD}Section>{RESET} {sectionpath}\n")
# Loop through each article that contains an unknown character
for article in unknown_chars_articles: for article in unknown_chars_articles:
# Print the title of the article in which there is unknown characters
# & the number of them
unknown_chars_apparitions: list[str] = article.get_unknown_chars() unknown_chars_apparitions: list[str] = article.get_unknown_chars()
nb: int = len(unknown_chars_apparitions) nb: int = len(unknown_chars_apparitions)
s: str = "s" if nb > 1 else "" s: str = "s" if nb > 1 else ""
@ -64,7 +100,8 @@ if __name__ == "__main__":
f"\n{BOLD}{nb}{RESET} unknown character{s} in {BOLD}{article.lang}{RESET} " f"\n{BOLD}{nb}{RESET} unknown character{s} in {BOLD}{article.lang}{RESET} "
+ highlight_unknown_chars(article.title, R, RESET) + highlight_unknown_chars(article.title, R, RESET)
) )
# Print the context in which the unknown characters are found
for text in unknown_chars_apparitions: for text in unknown_chars_apparitions:
print(f" {BOLD}{RESET} " + highlight_unknown_chars(text, R, RESET)) print(f" {BOLD}{RESET} " + highlight_unknown_chars(text, R, RESET))
db.close() # Close the database connection db.close() # Close the connection with the database