iterate over sections, creating _index.md files. commenting. more context with unknown characters
This commit is contained in:
parent
d15ad5fd8e
commit
6740035958
241
spip2md/lib.py
241
spip2md/lib.py
@ -9,29 +9,92 @@ from database import SpipArticles, SpipAuteurs, SpipAuteursLiens, SpipRubriques
|
|||||||
|
|
||||||
# from yaml import CDumper as Dumper
|
# from yaml import CDumper as Dumper
|
||||||
|
|
||||||
|
FILETYPE: str = "md"
|
||||||
|
|
||||||
class Article:
|
|
||||||
def __init__(self, article):
|
class Item:
    """Fields and Markdown export logic shared by SPIP articles and sections.

    ``get_frontmatter`` reads ``self.id``, which this class only declares:
    subclasses must assign it. ``get_content`` also appends the optional
    ``caption``, ``ps`` and ``microblog`` attributes when a subclass sets them.
    """

    id: int

    def __init__(self, item) -> None:
        """Copy the columns common to every exported item from a database row.

        ``item`` must expose the SPIP column attributes read below.
        """
        self.title: str = convert_meta(item.titre)
        self.section_id: int = item.id_rubrique
        self.description: str = convert_meta(item.descriptif)
        self.text: str = convert_body(item.texte)  # Markdown
        self.publication: str = item.date
        # Only the "publie" status means published; any other status is a draft
        self.draft: bool = item.statut != "publie"
        self.sector_id: int = item.id_secteur
        self.update: str = item.maj
        self.lang: str = item.lang
        self.set_lang: bool = item.langue_choisie  # TODO Why ?
        self.translation_key: int = item.id_trad
        self.extra: str = item.extra  # Probably unused

    def get_slug(self, date: bool = False) -> str:
        """Return the slugified title, prefixed by the publication date if ``date``."""
        return slugify(f"{self.publication if date else ''}-{self.title}")

    def get_filename(self) -> str:
        """Return the index file name: ``index.<lang>.<FILETYPE>``."""
        return "index" + "." + self.lang + "." + FILETYPE

    def get_frontmatter(self) -> str:
        """Return the item's metadata serialized by ``dump``.

        NOTE(review): ``dump`` is presumably ``yaml.dump`` -- confirm against
        the file's imports.
        """
        return dump(
            {
                "lang": self.lang,
                "translationKey": self.translation_key,
                "title": self.title,
                "publishDate": self.publication,
                "lastmod": self.update,
                "draft": self.draft,
                "description": self.description,
                # Debugging
                "spip_id": self.id,
                "spip_id_secteur": self.sector_id,
            },
            allow_unicode=True,
        )

    def get_content(self) -> str:
        """Build the full file content: front matter followed by the body parts.

        Parts are separated by a blank line. Optional parts (``caption``,
        ``extra``, ``ps``, ``microblog``) are skipped when the attribute is
        missing, ``None`` or empty.
        """
        parts: list[str] = ["---\n" + self.get_frontmatter() + "---"]
        # If there is a caption, add the caption followed by a hr
        caption = getattr(self, "caption", None)
        if caption:
            parts.append(caption + "\n\n***")
        # Add the title as a Markdown h1
        if self.title:
            parts.append("# " + self.title)
        # Main text
        if self.text:
            parts.append(self.text)
        # Same with an "extra" section
        if self.extra:
            parts.append("# EXTRA\n\n" + self.extra)
        # PS
        ps = getattr(self, "ps", None)
        if ps:
            parts.append("# POST-SCRIPTUM\n\n" + ps)
        # Microblog
        microblog = getattr(self, "microblog", None)
        if microblog:
            parts.append("# MICROBLOGGING\n\n" + microblog)
        return "\n\n".join(parts)

    def get_unknown_chars(self) -> list[str]:
        """Return every occurrence of an unknown character, with its context.

        For each entry of ``unknown_iso`` found in the title or the text, the
        returned string holds up to 20 characters preceding the match and the
        remainder of the line it appears on.
        """
        errors: list[str] = []
        for text in (self.title, self.text):
            for char in unknown_iso:
                # ``char`` is inserted into the pattern as-is (not escaped)
                pattern = r".{0,20}" + char + r".*(?=\r?\n|$)"
                errors.extend(match.group() for match in finditer(pattern, text))
        return errors
|
||||||
|
|
||||||
|
|
||||||
|
class Article(Item):
|
||||||
|
def __init__(self, article) -> None:
|
||||||
|
super().__init__(article)
|
||||||
self.id: int = article.id_article
|
self.id: int = article.id_article
|
||||||
self.surtitle: str = article.surtitre # Probably unused
|
self.surtitle: str = article.surtitre # Probably unused
|
||||||
self.title: str = convert_meta(article.titre)
|
|
||||||
self.subtitle: str = article.soustitre # Probably unused
|
self.subtitle: str = article.soustitre # Probably unused
|
||||||
self.section_id: int = article.id_rubrique
|
|
||||||
self.description: str = convert_meta(article.descriptif)
|
|
||||||
self.caption: str = article.chapo # Probably unused
|
self.caption: str = article.chapo # Probably unused
|
||||||
self.text: str = convert_body(article.texte) # Markdown
|
|
||||||
self.ps: str = article.ps # Probably unused
|
self.ps: str = article.ps # Probably unused
|
||||||
self.publication: str = article.date
|
|
||||||
self.draft: bool = False if article.statut == "publie" else True
|
|
||||||
self.sector_id: int = article.id_secteur
|
|
||||||
self.update: str = article.maj
|
|
||||||
self.update_2: str = article.date_modif # Probably unused duplicate of maj
|
self.update_2: str = article.date_modif # Probably unused duplicate of maj
|
||||||
self.creation: str = article.date_redac
|
self.creation: str = article.date_redac
|
||||||
self.forum: bool = article.accepter_forum # TODO Why ?
|
self.forum: bool = article.accepter_forum # TODO Why ?
|
||||||
self.lang: str = article.lang
|
|
||||||
self.set_lang: bool = article.langue_choisie # TODO Why ?
|
|
||||||
self.translation_key: int = article.id_trad
|
|
||||||
self.extra: str = article.extra # Probably unused
|
|
||||||
self.sitename: str = article.nom_site # Probably useless
|
self.sitename: str = article.nom_site # Probably useless
|
||||||
self.virtual: str = article.virtuel # TODO Why ?
|
self.virtual: str = article.virtuel # TODO Why ?
|
||||||
self.microblog: str = article.microblog # Probably unused
|
self.microblog: str = article.microblog # Probably unused
|
||||||
@ -41,19 +104,6 @@ class Article:
|
|||||||
# self.popularity: float = article.popularite # USELESS in static
|
# self.popularity: float = article.popularite # USELESS in static
|
||||||
# self.version = article.id_version # USELESS
|
# self.version = article.id_version # USELESS
|
||||||
|
|
||||||
def get_section(self) -> str:
|
|
||||||
return convert_meta(
|
|
||||||
SpipRubriques.select()
|
|
||||||
.where(SpipRubriques.id_rubrique == self.section_id)[0]
|
|
||||||
.titre
|
|
||||||
)
|
|
||||||
|
|
||||||
def get_path(self) -> str:
|
|
||||||
return slugify(self.get_section()) + "/" + slugify(f"{self.title}") + "/"
|
|
||||||
|
|
||||||
def get_filename(self) -> str:
|
|
||||||
return "index." + self.lang + ".md"
|
|
||||||
|
|
||||||
def get_authors(self) -> tuple:
|
def get_authors(self) -> tuple:
|
||||||
return (
|
return (
|
||||||
SpipAuteurs.select()
|
SpipAuteurs.select()
|
||||||
@ -87,112 +137,79 @@ class Article:
|
|||||||
allow_unicode=True,
|
allow_unicode=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_article(self) -> str:
|
|
||||||
# Build the final article text
|
|
||||||
article: str = "---\n" + self.get_frontmatter() + "---"
|
|
||||||
# If there is a caption, add the caption followed by a hr
|
|
||||||
if len(self.caption) > 0:
|
|
||||||
article += "\n\n" + self.caption + "\n\n***"
|
|
||||||
# Add the title as a Markdown h1
|
|
||||||
if len(self.title) > 0:
|
|
||||||
article += "\n\n# " + self.title
|
|
||||||
# If there is a text, add the text preceded by two line breaks
|
|
||||||
if len(self.text) > 0:
|
|
||||||
article += "\n\n" + self.text
|
|
||||||
# Same with an "extra" section
|
|
||||||
if self.extra is not None and len(self.extra) > 0:
|
|
||||||
article += "\n\n# EXTRA\n\n" + self.extra
|
|
||||||
# PS
|
|
||||||
if len(self.ps) > 0:
|
|
||||||
article += "\n\n# POST-SCRIPTUM\n\n" + self.ps
|
|
||||||
# Microblog
|
|
||||||
if len(self.microblog) > 0:
|
|
||||||
article += "\n\n# MICROBLOGGING\n\n" + self.microblog
|
|
||||||
return article
|
|
||||||
|
|
||||||
def get_unknown_chars(self) -> list[str]:
|
class Section(Item):
|
||||||
errors: list[str] = []
|
|
||||||
for text in (self.title, self.text):
|
|
||||||
for char in unknown_iso:
|
|
||||||
for match in finditer(char + r".*(?=\r?\n|$)", text):
|
|
||||||
errors.append(match.group())
|
|
||||||
return errors
|
|
||||||
|
|
||||||
|
|
||||||
class Section:
|
|
||||||
def __init__(self, section) -> None:
|
def __init__(self, section) -> None:
|
||||||
|
super().__init__(section)
|
||||||
self.id: int = section.id_rubrique
|
self.id: int = section.id_rubrique
|
||||||
self.parent_id: int = section.id_parent
|
self.parent_id: int = section.id_parent
|
||||||
self.title: str = convert_meta(section.titre)
|
|
||||||
self.description: str = convert_meta(section.descriptif)
|
|
||||||
self.text: str = convert_body(section.texte) # Markdown
|
|
||||||
self.sector_id: int = section.id_secteur
|
|
||||||
self.update: str = section.maj
|
|
||||||
self.publication: str = section.date
|
|
||||||
self.draft: bool = False if section.statut == "publie" else True
|
|
||||||
self.lang: str = section.lang
|
|
||||||
self.lang_set: bool = False if section.langue_choisie == "oui" else True
|
|
||||||
self.extra: str = section.extra # Probably unused
|
|
||||||
self.translation_key: int = section.id_trad
|
|
||||||
self.depth: int = section.profondeur
|
self.depth: int = section.profondeur
|
||||||
self.agenda: int = section.agenda
|
self.agenda: int = section.agenda
|
||||||
|
|
||||||
def get_articles(self, limit: int):
|
def get_articles(self, limit: int = 0):
|
||||||
return Articles(limit)
|
return Articles(self.id, limit)
|
||||||
|
|
||||||
|
|
||||||
class Articles:
|
class LimitCounter:
|
||||||
exported: int = 0
|
count: int
|
||||||
|
LIMIT: int
|
||||||
|
|
||||||
def __init__(self, limit: int) -> None:
|
def __init__(self, limit: int) -> None:
|
||||||
# Query the DB to retrieve all articles sorted by publication date
|
self.count = -1
|
||||||
self.articles = (
|
self.LIMIT = limit
|
||||||
SpipArticles.select().order_by(SpipArticles.date.desc()).limit(limit)
|
|
||||||
)
|
|
||||||
self.toExport: int = len(self.articles)
|
|
||||||
|
|
||||||
def remaining(self):
|
def remaining(self) -> int:
|
||||||
return self.toExport - self.exported
|
return self.LIMIT - self.count
|
||||||
|
|
||||||
|
def step(self) -> int:
|
||||||
|
self.count += 1
|
||||||
|
if self.remaining() <= 0:
|
||||||
|
raise StopIteration
|
||||||
|
return self.count
|
||||||
|
|
||||||
|
|
||||||
|
class Items:
|
||||||
|
items: list
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
# Set a counter caped at the number of retrieved items
|
||||||
|
self.count = LimitCounter(len(self.items))
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def __next__(self):
|
def __len__(self) -> int:
|
||||||
if self.remaining() <= 0:
|
return self.count.LIMIT
|
||||||
raise StopIteration
|
|
||||||
self.exported += 1
|
|
||||||
article = Article(self.articles[self.exported - 1])
|
|
||||||
return (
|
|
||||||
{"exported": self.exported, "remaining": self.remaining()},
|
|
||||||
article,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class Sections:
|
class Articles(Items):
    """Iterable over the articles of one section, most recent first."""

    def __init__(self, section_id: int, limit: int = 0) -> None:
        """Query the articles belonging to ``section_id``.

        A positive ``limit`` caps the number of retrieved articles; any other
        value retrieves all articles of the section.
        """
        # Always restrict the query to the requested section, whether or not
        # a limit is given, sorted by publication date
        query = (
            SpipArticles.select()
            .where(SpipArticles.id_rubrique == section_id)
            .order_by(SpipArticles.date.desc())
        )
        self.items = query.limit(limit) if limit > 0 else query
        super().__init__()

    def __next__(self):
        """Return the next ``(Article, counter)`` pair.

        ``self.count.step()`` supplies the index of the next item and raises
        ``StopIteration`` once every retrieved item has been returned.
        """
        return (Article(self.items[self.count.step()]), self.count)
|
||||||
raise StopIteration
|
|
||||||
self.exported += 1
|
|
||||||
section = Section(self.articles[self.exported - 1])
|
class Sections(Items):
|
||||||
return (
|
def __init__(self, limit: int = 0) -> None:
|
||||||
{"exported": self.exported, "remaining": self.remaining()},
|
# Query the DB to retrieve all sections sorted by publication date
|
||||||
section,
|
if limit > 0:
|
||||||
)
|
self.items = (
|
||||||
|
SpipRubriques.select().order_by(SpipRubriques.date.desc()).limit(limit)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.items = SpipRubriques.select().order_by(SpipRubriques.date.desc())
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
def __next__(self):
|
||||||
|
return (Section(self.items[self.count.step()]), self.count)
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
#!python
|
#!python
|
||||||
# pyright: strict
|
# pyright: strict
|
||||||
import sys
|
|
||||||
from os import makedirs, mkdir
|
from os import makedirs, mkdir
|
||||||
from shutil import rmtree
|
from shutil import rmtree
|
||||||
|
from sys import argv
|
||||||
|
|
||||||
from config import config
|
from config import config
|
||||||
from converter import highlight_unknown_chars
|
from converter import highlight_unknown_chars
|
||||||
@ -20,14 +20,14 @@ RESET: str = "\033[0m"
|
|||||||
db.init(config.db, host=config.db_host, user=config.db_user, password=config.db_pass)
|
db.init(config.db, host=config.db_host, user=config.db_user, password=config.db_pass)
|
||||||
db.connect()
|
db.connect()
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__": # Following is executed only if script is directly executed
|
||||||
# Define max nb of articles to export based on first CLI param
|
# Define max nb of articles to export based on first CLI argument
|
||||||
if len(sys.argv) > 1:
|
if len(argv) > 1:
|
||||||
maxexport = int(sys.argv[1])
|
maxexport = int(argv[1])
|
||||||
else:
|
else:
|
||||||
maxexport = config.default_export_nb
|
maxexport = config.default_export_nb
|
||||||
|
|
||||||
# Clean the output dir & create a new
|
# Clear the output dir & create a new
|
||||||
rmtree(config.output_dir, True)
|
rmtree(config.output_dir, True)
|
||||||
mkdir(config.output_dir)
|
mkdir(config.output_dir)
|
||||||
|
|
||||||
@ -35,28 +35,64 @@ if __name__ == "__main__":
|
|||||||
unknown_chars_articles: list[Article] = []
|
unknown_chars_articles: list[Article] = []
|
||||||
|
|
||||||
# Loop among first maxexport articles & export them
|
# Loop among first maxexport articles & export them
|
||||||
for counter, section in Sections():
|
for section, counter in Sections():
|
||||||
for counter, article in section.get_articles(maxexport):
|
# Print the name of the exported section & number of remaining sections
|
||||||
if (counter["exported"] - 1) % 100 == 0:
|
print(
|
||||||
print(
|
f"{BOLD}{counter.count}. {RESET}"
|
||||||
f"\n{BOLD}Exporting {R}{counter['remaining']+1}{RESET}"
|
+ highlight_unknown_chars(section.title, R, RESET),
|
||||||
+ f"{BOLD} SPIP articles to Markdown & YAML files{RESET}\n"
|
end="",
|
||||||
)
|
)
|
||||||
empty: str = "EMPTY " if len(article.text) < 1 else ""
|
if counter.remaining() > 2:
|
||||||
print(
|
print(
|
||||||
f"{BOLD}{counter['exported']}. {empty}{RESET}"
|
f" {BOLD}{R}{counter.remaining()-1}{RESET} {BOLD}sections left"
|
||||||
|
+ RESET,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
print()
|
||||||
|
# Define the section’s path (directory) & create directory(ies) if needed
|
||||||
|
sectiondir: str = config.output_dir + "/" + section.get_slug()
|
||||||
|
makedirs(sectiondir, exist_ok=True)
|
||||||
|
# Define the section filename & write the index at that filename
|
||||||
|
sectionpath: str = sectiondir + "/" + section.get_filename()
|
||||||
|
with open(sectionpath, "w") as f:
|
||||||
|
f.write(section.get_content())
|
||||||
|
# Loop over section’s articles
|
||||||
|
articles = section.get_articles(maxexport)
|
||||||
|
maxexport -= len(articles)
|
||||||
|
for article, counter in articles:
|
||||||
|
# Print the remaining number of articles to export every 100 articles
|
||||||
|
if counter.count % 100 == 0:
|
||||||
|
s: str = "s" if counter.remaining() > 1 else ""
|
||||||
|
print(
|
||||||
|
f" {BOLD}Exporting {R}{counter.remaining()}{RESET}"
|
||||||
|
+ f"{BOLD} SPIP article{s}{RESET} to Markdown & YAML files\n"
|
||||||
|
)
|
||||||
|
# Print the title of the article being exported
|
||||||
|
print(
|
||||||
|
f" {BOLD}{counter.count + 1}. "
|
||||||
|
+ ("EMPTY " if len(article.text) < 1 else "")
|
||||||
|
+ RESET
|
||||||
+ highlight_unknown_chars(article.title, R, RESET)
|
+ highlight_unknown_chars(article.title, R, RESET)
|
||||||
)
|
)
|
||||||
fullpath: str = config.output_dir + "/" + article.get_path()
|
# Define the full article path & create directory(ies) if needed
|
||||||
print(f"{BOLD}>{RESET} {fullpath}{article.get_filename()}")
|
articledir: str = sectiondir + "/" + article.get_slug()
|
||||||
makedirs(fullpath, exist_ok=True)
|
makedirs(articledir, exist_ok=True)
|
||||||
with open(fullpath + article.get_filename(), "w") as f:
|
# Define the article filename & write the article at the filename
|
||||||
f.write(article.get_article())
|
articlepath: str = articledir + "/" + article.get_filename()
|
||||||
|
with open(articlepath, "w") as f:
|
||||||
|
f.write(article.get_content())
|
||||||
# Store detected unknown characters
|
# Store detected unknown characters
|
||||||
if len(article.get_unknown_chars()) > 0:
|
if len(article.get_unknown_chars()) > 0:
|
||||||
unknown_chars_articles.append(article)
|
unknown_chars_articles.append(article)
|
||||||
|
# Print the outputted file’s path when finished exporting the article
|
||||||
|
print(f" {BOLD}Article>{RESET} {articlepath}")
|
||||||
|
# Print the outputted file’s path when finished exporting the section
|
||||||
|
print(f"\n{BOLD}Section>{RESET} {sectionpath}\n")
|
||||||
|
|
||||||
|
# Loop through each article that contains an unknown character
|
||||||
for article in unknown_chars_articles:
|
for article in unknown_chars_articles:
|
||||||
|
# Print the title of the article in which there is unknown characters
|
||||||
|
# & the number of them
|
||||||
unknown_chars_apparitions: list[str] = article.get_unknown_chars()
|
unknown_chars_apparitions: list[str] = article.get_unknown_chars()
|
||||||
nb: int = len(unknown_chars_apparitions)
|
nb: int = len(unknown_chars_apparitions)
|
||||||
s: str = "s" if nb > 1 else ""
|
s: str = "s" if nb > 1 else ""
|
||||||
@ -64,7 +100,8 @@ if __name__ == "__main__":
|
|||||||
f"\n{BOLD}{nb}{RESET} unknown character{s} in {BOLD}{article.lang}{RESET} "
|
f"\n{BOLD}{nb}{RESET} unknown character{s} in {BOLD}{article.lang}{RESET} "
|
||||||
+ highlight_unknown_chars(article.title, R, RESET)
|
+ highlight_unknown_chars(article.title, R, RESET)
|
||||||
)
|
)
|
||||||
|
# Print the context in which the unknown characters are found
|
||||||
for text in unknown_chars_apparitions:
|
for text in unknown_chars_apparitions:
|
||||||
print(f" {BOLD}…{RESET} " + highlight_unknown_chars(text, R, RESET))
|
print(f" {BOLD}…{RESET} " + highlight_unknown_chars(text, R, RESET))
|
||||||
|
|
||||||
db.close() # Close the database connection
|
db.close() # Close the connection with the database
|
||||||
|
Loading…
Reference in New Issue
Block a user