iterate over sections, creating _index.md files. commenting. more context with unknown characters
This commit is contained in:
parent
d15ad5fd8e
commit
6740035958
241
spip2md/lib.py
241
spip2md/lib.py
@ -9,29 +9,92 @@ from database import SpipArticles, SpipAuteurs, SpipAuteursLiens, SpipRubriques
|
||||
|
||||
# from yaml import CDumper as Dumper
|
||||
|
||||
FILETYPE: str = "md"
|
||||
|
||||
class Article:
|
||||
def __init__(self, article):
|
||||
|
||||
class Item:
|
||||
id: int
|
||||
|
||||
def __init__(self, item) -> None:
|
||||
self.title: str = convert_meta(item.titre)
|
||||
self.section_id: int = item.id_rubrique
|
||||
self.description: str = convert_meta(item.descriptif)
|
||||
self.text: str = convert_body(item.texte) # Markdown
|
||||
self.publication: str = item.date
|
||||
self.draft: bool = item.statut == "publie"
|
||||
self.sector_id: int = item.id_secteur
|
||||
self.update: str = item.maj
|
||||
self.lang: str = item.lang
|
||||
self.set_lang: bool = item.langue_choisie # TODO Why ?
|
||||
self.translation_key: int = item.id_trad
|
||||
self.extra: str = item.extra # Probably unused
|
||||
|
||||
def get_slug(self, date: bool = False) -> str:
|
||||
return slugify(f"{self.publication if date else ''}-{self.title}")
|
||||
|
||||
def get_filename(self) -> str:
|
||||
return "index" + "." + self.lang + "." + FILETYPE
|
||||
|
||||
def get_frontmatter(self) -> str:
|
||||
return dump(
|
||||
{
|
||||
"lang": self.lang,
|
||||
"translationKey": self.translation_key,
|
||||
"title": self.title,
|
||||
"publishDate": self.publication,
|
||||
"lastmod": self.update,
|
||||
"draft": self.draft,
|
||||
"description": self.description,
|
||||
# Debugging
|
||||
"spip_id": self.id,
|
||||
"spip_id_secteur": self.sector_id,
|
||||
},
|
||||
allow_unicode=True,
|
||||
)
|
||||
|
||||
def get_content(self) -> str:
|
||||
# Build the final article text
|
||||
article: str = "---\n" + self.get_frontmatter() + "---"
|
||||
# If there is a caption, add the caption followed by a hr
|
||||
if hasattr(self, "caption") and len(self.caption) > 0:
|
||||
article += "\n\n" + self.caption + "\n\n***"
|
||||
# Add the title as a Markdown h1
|
||||
if len(self.title) > 0:
|
||||
article += "\n\n# " + self.title
|
||||
# If there is a text, add the text preceded by two line breaks
|
||||
if len(self.text) > 0:
|
||||
article += "\n\n" + self.text
|
||||
# Same with an "extra" section
|
||||
if self.extra is not None and len(self.extra) > 0:
|
||||
article += "\n\n# EXTRA\n\n" + self.extra
|
||||
# PS
|
||||
if hasattr(self, "ps") and len(self.ps) > 0:
|
||||
article += "\n\n# POST-SCRIPTUM\n\n" + self.ps
|
||||
# Microblog
|
||||
if hasattr(self, "microblog") and len(self.microblog) > 0:
|
||||
article += "\n\n# MICROBLOGGING\n\n" + self.microblog
|
||||
return article
|
||||
|
||||
def get_unknown_chars(self) -> list[str]:
|
||||
errors: list[str] = []
|
||||
for text in (self.title, self.text):
|
||||
for char in unknown_iso:
|
||||
for match in finditer(r".{0-20}" + char + r".*(?=\r?\n|$)", text):
|
||||
errors.append(match.group())
|
||||
return errors
|
||||
|
||||
|
||||
class Article(Item):
|
||||
def __init__(self, article) -> None:
|
||||
super().__init__(article)
|
||||
self.id: int = article.id_article
|
||||
self.surtitle: str = article.surtitre # Probably unused
|
||||
self.title: str = convert_meta(article.titre)
|
||||
self.subtitle: str = article.soustitre # Probably unused
|
||||
self.section_id: int = article.id_rubrique
|
||||
self.description: str = convert_meta(article.descriptif)
|
||||
self.caption: str = article.chapo # Probably unused
|
||||
self.text: str = convert_body(article.texte) # Markdown
|
||||
self.ps: str = article.ps # Probably unused
|
||||
self.publication: str = article.date
|
||||
self.draft: bool = False if article.statut == "publie" else True
|
||||
self.sector_id: int = article.id_secteur
|
||||
self.update: str = article.maj
|
||||
self.update_2: str = article.date_modif # Probably unused duplicate of maj
|
||||
self.creation: str = article.date_redac
|
||||
self.forum: bool = article.accepter_forum # TODO Why ?
|
||||
self.lang: str = article.lang
|
||||
self.set_lang: bool = article.langue_choisie # TODO Why ?
|
||||
self.translation_key: int = article.id_trad
|
||||
self.extra: str = article.extra # Probably unused
|
||||
self.sitename: str = article.nom_site # Probably useless
|
||||
self.virtual: str = article.virtuel # TODO Why ?
|
||||
self.microblog: str = article.microblog # Probably unused
|
||||
@ -41,19 +104,6 @@ class Article:
|
||||
# self.popularity: float = article.popularite # USELESS in static
|
||||
# self.version = article.id_version # USELESS
|
||||
|
||||
def get_section(self) -> str:
|
||||
return convert_meta(
|
||||
SpipRubriques.select()
|
||||
.where(SpipRubriques.id_rubrique == self.section_id)[0]
|
||||
.titre
|
||||
)
|
||||
|
||||
def get_path(self) -> str:
|
||||
return slugify(self.get_section()) + "/" + slugify(f"{self.title}") + "/"
|
||||
|
||||
def get_filename(self) -> str:
|
||||
return "index." + self.lang + ".md"
|
||||
|
||||
def get_authors(self) -> tuple:
|
||||
return (
|
||||
SpipAuteurs.select()
|
||||
@ -87,112 +137,79 @@ class Article:
|
||||
allow_unicode=True,
|
||||
)
|
||||
|
||||
def get_article(self) -> str:
|
||||
# Build the final article text
|
||||
article: str = "---\n" + self.get_frontmatter() + "---"
|
||||
# If there is a caption, add the caption followed by a hr
|
||||
if len(self.caption) > 0:
|
||||
article += "\n\n" + self.caption + "\n\n***"
|
||||
# Add the title as a Markdown h1
|
||||
if len(self.title) > 0:
|
||||
article += "\n\n# " + self.title
|
||||
# If there is a text, add the text preceded by two line breaks
|
||||
if len(self.text) > 0:
|
||||
article += "\n\n" + self.text
|
||||
# Same with an "extra" section
|
||||
if self.extra is not None and len(self.extra) > 0:
|
||||
article += "\n\n# EXTRA\n\n" + self.extra
|
||||
# PS
|
||||
if len(self.ps) > 0:
|
||||
article += "\n\n# POST-SCRIPTUM\n\n" + self.ps
|
||||
# Microblog
|
||||
if len(self.microblog) > 0:
|
||||
article += "\n\n# MICROBLOGGING\n\n" + self.microblog
|
||||
return article
|
||||
|
||||
def get_unknown_chars(self) -> list[str]:
|
||||
errors: list[str] = []
|
||||
for text in (self.title, self.text):
|
||||
for char in unknown_iso:
|
||||
for match in finditer(char + r".*(?=\r?\n|$)", text):
|
||||
errors.append(match.group())
|
||||
return errors
|
||||
|
||||
|
||||
class Section:
|
||||
class Section(Item):
|
||||
def __init__(self, section) -> None:
|
||||
super().__init__(section)
|
||||
self.id: int = section.id_rubrique
|
||||
self.parent_id: int = section.id_parent
|
||||
self.title: str = convert_meta(section.titre)
|
||||
self.description: str = convert_meta(section.descriptif)
|
||||
self.text: str = convert_body(section.texte) # Markdown
|
||||
self.sector_id: int = section.id_secteur
|
||||
self.update: str = section.maj
|
||||
self.publication: str = section.date
|
||||
self.draft: bool = False if section.statut == "publie" else True
|
||||
self.lang: str = section.lang
|
||||
self.lang_set: bool = False if section.langue_choisie == "oui" else True
|
||||
self.extra: str = section.extra # Probably unused
|
||||
self.translation_key: int = section.id_trad
|
||||
self.depth: int = section.profondeur
|
||||
self.agenda: int = section.agenda
|
||||
|
||||
def get_articles(self, limit: int):
|
||||
return Articles(limit)
|
||||
def get_articles(self, limit: int = 0):
|
||||
return Articles(self.id, limit)
|
||||
|
||||
|
||||
class Articles:
|
||||
exported: int = 0
|
||||
class LimitCounter:
|
||||
count: int
|
||||
LIMIT: int
|
||||
|
||||
def __init__(self, limit: int) -> None:
|
||||
# Query the DB to retrieve all articles sorted by publication date
|
||||
self.articles = (
|
||||
SpipArticles.select().order_by(SpipArticles.date.desc()).limit(limit)
|
||||
)
|
||||
self.toExport: int = len(self.articles)
|
||||
self.count = -1
|
||||
self.LIMIT = limit
|
||||
|
||||
def remaining(self):
|
||||
return self.toExport - self.exported
|
||||
def remaining(self) -> int:
|
||||
return self.LIMIT - self.count
|
||||
|
||||
def step(self) -> int:
|
||||
self.count += 1
|
||||
if self.remaining() <= 0:
|
||||
raise StopIteration
|
||||
return self.count
|
||||
|
||||
|
||||
class Items:
|
||||
items: list
|
||||
|
||||
def __init__(self) -> None:
|
||||
# Set a counter caped at the number of retrieved items
|
||||
self.count = LimitCounter(len(self.items))
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self):
|
||||
if self.remaining() <= 0:
|
||||
raise StopIteration
|
||||
self.exported += 1
|
||||
article = Article(self.articles[self.exported - 1])
|
||||
return (
|
||||
{"exported": self.exported, "remaining": self.remaining()},
|
||||
article,
|
||||
)
|
||||
def __len__(self) -> int:
|
||||
return self.count.LIMIT
|
||||
|
||||
|
||||
class Sections:
|
||||
exported: int = 0
|
||||
|
||||
def __init__(self, limit: int = 0) -> None:
|
||||
class Articles(Items):
|
||||
def __init__(self, section_id: int, limit: int = 0) -> None:
|
||||
# Query the DB to retrieve all articles sorted by publication date
|
||||
if limit > 0:
|
||||
self.articles = (
|
||||
SpipArticles.select().order_by(SpipArticles.date.desc()).limit(limit)
|
||||
self.items = (
|
||||
SpipArticles.select()
|
||||
.where(SpipArticles.id_rubrique == section_id)
|
||||
.order_by(SpipArticles.date.desc())
|
||||
.limit(limit)
|
||||
)
|
||||
else:
|
||||
self.articles = SpipArticles.select().order_by(SpipArticles.date.desc())
|
||||
self.toExport: int = len(self.articles)
|
||||
|
||||
def remaining(self):
|
||||
return self.toExport - self.exported
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
self.items = SpipArticles.select().order_by(SpipArticles.date.desc())
|
||||
super().__init__()
|
||||
|
||||
def __next__(self):
|
||||
if self.remaining() <= 0:
|
||||
raise StopIteration
|
||||
self.exported += 1
|
||||
section = Section(self.articles[self.exported - 1])
|
||||
return (
|
||||
{"exported": self.exported, "remaining": self.remaining()},
|
||||
section,
|
||||
)
|
||||
return (Article(self.items[self.count.step()]), self.count)
|
||||
|
||||
|
||||
class Sections(Items):
|
||||
def __init__(self, limit: int = 0) -> None:
|
||||
# Query the DB to retrieve all sections sorted by publication date
|
||||
if limit > 0:
|
||||
self.items = (
|
||||
SpipRubriques.select().order_by(SpipRubriques.date.desc()).limit(limit)
|
||||
)
|
||||
else:
|
||||
self.items = SpipRubriques.select().order_by(SpipRubriques.date.desc())
|
||||
super().__init__()
|
||||
|
||||
def __next__(self):
|
||||
return (Section(self.items[self.count.step()]), self.count)
|
||||
|
@ -1,8 +1,8 @@
|
||||
#!python
|
||||
# pyright: strict
|
||||
import sys
|
||||
from os import makedirs, mkdir
|
||||
from shutil import rmtree
|
||||
from sys import argv
|
||||
|
||||
from config import config
|
||||
from converter import highlight_unknown_chars
|
||||
@ -20,14 +20,14 @@ RESET: str = "\033[0m"
|
||||
db.init(config.db, host=config.db_host, user=config.db_user, password=config.db_pass)
|
||||
db.connect()
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Define max nb of articles to export based on first CLI param
|
||||
if len(sys.argv) > 1:
|
||||
maxexport = int(sys.argv[1])
|
||||
if __name__ == "__main__": # Following is executed only if script is directly executed
|
||||
# Define max nb of articles to export based on first CLI argument
|
||||
if len(argv) > 1:
|
||||
maxexport = int(argv[1])
|
||||
else:
|
||||
maxexport = config.default_export_nb
|
||||
|
||||
# Clean the output dir & create a new
|
||||
# Clear the output dir & create a new
|
||||
rmtree(config.output_dir, True)
|
||||
mkdir(config.output_dir)
|
||||
|
||||
@ -35,28 +35,64 @@ if __name__ == "__main__":
|
||||
unknown_chars_articles: list[Article] = []
|
||||
|
||||
# Loop among first maxexport articles & export them
|
||||
for counter, section in Sections():
|
||||
for counter, article in section.get_articles(maxexport):
|
||||
if (counter["exported"] - 1) % 100 == 0:
|
||||
print(
|
||||
f"\n{BOLD}Exporting {R}{counter['remaining']+1}{RESET}"
|
||||
+ f"{BOLD} SPIP articles to Markdown & YAML files{RESET}\n"
|
||||
)
|
||||
empty: str = "EMPTY " if len(article.text) < 1 else ""
|
||||
for section, counter in Sections():
|
||||
# Print the name of the exported section & number of remaining sections
|
||||
print(
|
||||
f"{BOLD}{counter.count}. {RESET}"
|
||||
+ highlight_unknown_chars(section.title, R, RESET),
|
||||
end="",
|
||||
)
|
||||
if counter.remaining() > 2:
|
||||
print(
|
||||
f"{BOLD}{counter['exported']}. {empty}{RESET}"
|
||||
f" {BOLD}{R}{counter.remaining()-1}{RESET} {BOLD}sections left"
|
||||
+ RESET,
|
||||
)
|
||||
else:
|
||||
print()
|
||||
# Define the section’s path (directory) & create directory(ies) if needed
|
||||
sectiondir: str = config.output_dir + "/" + section.get_slug()
|
||||
makedirs(sectiondir, exist_ok=True)
|
||||
# Define the section filename & write the index at that filename
|
||||
sectionpath: str = sectiondir + "/" + section.get_filename()
|
||||
with open(sectionpath, "w") as f:
|
||||
f.write(section.get_content())
|
||||
# Loop over section’s articles
|
||||
articles = section.get_articles(maxexport)
|
||||
maxexport -= len(articles)
|
||||
for article, counter in articles:
|
||||
# Print the remaining number of articles to export every 100 articles
|
||||
if counter.count % 100 == 0:
|
||||
s: str = "s" if counter.remaining() > 1 else ""
|
||||
print(
|
||||
f" {BOLD}Exporting {R}{counter.remaining()}{RESET}"
|
||||
+ f"{BOLD} SPIP article{s}{RESET} to Markdown & YAML files\n"
|
||||
)
|
||||
# Print the title of the article being exported
|
||||
print(
|
||||
f" {BOLD}{counter.count + 1}. "
|
||||
+ ("EMPTY " if len(article.text) < 1 else "")
|
||||
+ RESET
|
||||
+ highlight_unknown_chars(article.title, R, RESET)
|
||||
)
|
||||
fullpath: str = config.output_dir + "/" + article.get_path()
|
||||
print(f"{BOLD}>{RESET} {fullpath}{article.get_filename()}")
|
||||
makedirs(fullpath, exist_ok=True)
|
||||
with open(fullpath + article.get_filename(), "w") as f:
|
||||
f.write(article.get_article())
|
||||
# Define the full article path & create directory(ies) if needed
|
||||
articledir: str = sectiondir + "/" + article.get_slug()
|
||||
makedirs(articledir, exist_ok=True)
|
||||
# Define the article filename & write the article at the filename
|
||||
articlepath: str = articledir + "/" + article.get_filename()
|
||||
with open(articlepath, "w") as f:
|
||||
f.write(article.get_content())
|
||||
# Store detected unknown characters
|
||||
if len(article.get_unknown_chars()) > 0:
|
||||
unknown_chars_articles.append(article)
|
||||
# Print the outputted file’s path when finished exporting the article
|
||||
print(f" {BOLD}Article>{RESET} {articlepath}")
|
||||
# Print the outputted file’s path when finished exporting the section
|
||||
print(f"\n{BOLD}Section>{RESET} {sectionpath}\n")
|
||||
|
||||
# Loop through each article that contains an unknown character
|
||||
for article in unknown_chars_articles:
|
||||
# Print the title of the article in which there is unknown characters
|
||||
# & the number of them
|
||||
unknown_chars_apparitions: list[str] = article.get_unknown_chars()
|
||||
nb: int = len(unknown_chars_apparitions)
|
||||
s: str = "s" if nb > 1 else ""
|
||||
@ -64,7 +100,8 @@ if __name__ == "__main__":
|
||||
f"\n{BOLD}{nb}{RESET} unknown character{s} in {BOLD}{article.lang}{RESET} "
|
||||
+ highlight_unknown_chars(article.title, R, RESET)
|
||||
)
|
||||
# Print the context in which the unknown characters are found
|
||||
for text in unknown_chars_apparitions:
|
||||
print(f" {BOLD}…{RESET} " + highlight_unknown_chars(text, R, RESET))
|
||||
|
||||
db.close() # Close the database connection
|
||||
db.close() # Close the connection with the database
|
||||
|
Loading…
Reference in New Issue
Block a user