Start refactoring the section tree export to be recursive, so that it can handle the whole depth of the SPIP section tree; also deduplicate code

This commit is contained in:
Guilhem Fauré 2023-05-25 13:52:00 +02:00
parent 7e3680d282
commit f06d09d338
4 changed files with 203 additions and 233 deletions

View File

@ -1,5 +1,5 @@
# pyright: strict
from os.path import isfile
from os.path import expanduser, isfile
from typing import Optional
from yaml import Loader, load
@ -18,19 +18,29 @@ class Configuration:
db_host: str = "localhost"
db_user: str = "spip"
db_pass: str = "password"
output_dir: str = "output"
max_articles_export: int = 1000
max_sections_export: int = 500
data_dir: str = "data"
output_dir: str = "output/"
data_dir: str = "data/"
clear_output: bool = False
prepend_h1: bool = True
export_filetype: str = "md"
max_articles_export: int = 1000 # TODO reimplement with recursion
max_sections_export: int = 500 # TODO reimplement with recursion
# Override the class-level defaults with the values found in a YAML config file.
# config_file: path of the YAML file, or None to keep every default untouched.
def __init__(self, config_file: Optional[str] = None):
    if config_file is None:
        return
    # Read config from config file
    # NOTE(review): yaml `load` with the full `Loader` can instantiate arbitrary
    # Python objects; only acceptable while the config file is trusted.
    with open(config_file) as f:
        # An empty YAML document loads as None: treat it as "no overrides"
        config = load(f.read(), Loader=Loader) or {}
    # Assign configuration for each attribute in config file
    for attr in config:
        value = config[attr]
        # Directory settings: expand ~ to the home path and guarantee a
        # trailing slash, since paths are built by plain concatenation
        if isinstance(attr, str) and "dir" in attr and isinstance(value, str):
            value = expanduser(value)
            if not value.endswith("/"):
                value += "/"
        # Single assignment, so the normalised directory is not overwritten
        setattr(self, attr, value)

View File

@ -1,151 +1,30 @@
#!python
from os import makedirs
from os.path import expanduser
from shutil import copyfile, rmtree
from shutil import rmtree
from sys import argv
from peewee import ModelSelect
from config import config
from converters import unknown_chars, unknown_chars_context
from database import DB
from spipobjects import (
Article,
Document,
Rubrique,
get_articles,
get_sections,
)
# Define styles (integers used as parameters of the terminal escape sequence built by style())
BO = 1 # Bold
IT = 3 # Italic
UN = 4 # Underline
# Define colors (same usage as the styles above)
R = 91 # Red
G = 92 # Green
Y = 93 # Yellow
B = 94 # Blue
C0 = 95 # Color
C1 = 96 # Color
C2 = 96 # Color  NOTE(review): same value as C1, possibly a typo, confirm the intended code
from styling import highlight, style
# Print string wrapped in a terminal escape sequence, without trailing newline.
# Every value of args becomes one parameter of the sequence; bold (1) is the default.
def style(string: str, *args: int) -> None:
    esc = "\033["  # Terminal escape sequence, needs to be closed by "m"
    codes: str = ";".join(str(code) for code in args) if args else "1"
    print(esc + codes + "m" + string + esc + "0m", end="")
# Print string without trailing newline, passing every (start, stop) slice given in
# start_stop through style() with BO and R; the text between slices is printed as is.
def highlight(string: str, *start_stop: tuple[int, int]) -> None:
    cursor = 0
    for span in start_stop:
        begin, finish = span
        # Plain text located before the highlighted slice
        print(string[cursor:begin], end="")
        style(string[begin:finish], BO, R)
        cursor = finish
    # Remaining plain text after the last slice
    print(string[cursor:], end="")
# Plural suffix: "s" when nb is greater than 1, empty string otherwise
def s(nb: int) -> str:
    if nb > 1:
        return "s"
    return ""
# Print the indentation string nb times, without trailing newline
def indent(nb: int = 1) -> None:
    print(" " * nb, end="")
# Print progress information about the section being exported, write its content
# into its own directory under config.output_dir, and return that directory.
def write_section(index: int, total: int, section: Rubrique) -> str:
    color = G  # Associate sections to green
    remaining: int = total - index
    # Counter, title (unknown characters highlighted) & number of sections left
    style(f"{index + 1}. ", BO)
    highlight(section.titre, *unknown_chars(section.titre))
    style(f" {remaining - 1}", BO, color)
    style(f" section{s(remaining)} left")
    # Directory of the section, created along with its parents when missing
    target_dir: str = config.output_dir + "/" + section.slug()
    makedirs(target_dir, exist_ok=True)
    # File holding the section content inside that directory
    target_file: str = target_dir + "/" + section.filename()
    with open(target_file, "w") as out:
        out.write(section.content())
    # Print export location when finished exporting
    style(" -> ", BO, color)
    print(target_file)
    # Hand the directory back to the caller
    return target_dir
# Output information about ongoing export & write article to output destination
def write_article(index: int, total: int, article: Article, sectiondir: str) -> str:
color = Y # Associate articles to yellow
# Print the remaining number of articles to export every 100 articles
if index % 100 == 0:
indent()
print("Exporting", end="")
style(f" {total-index}", BO, color)
print(" SPIP", end="")
style(f" article{s(total-index)}")
print(" to Markdown & YAML files")
# Print the title of the article being exported
style(
f" {index + 1}. "
+ ("EMPTY " if len(article.texte) < 1 else "")
+ f"{article.lang} "
# Build the query selecting the sections whose id_parent is 0, ordered by
# descending date and capped to limit rows
def root_sections(limit: int = 10**3) -> ModelSelect:
    without_parent = Rubrique.select().where(Rubrique.id_parent == 0)
    newest_first = without_parent.order_by(Rubrique.date.desc())
    return newest_first.limit(limit)
highlight(article.titre, *unknown_chars(article.titre))
# Define the full article path & create directory(ies) if needed
articledir: str = sectiondir + "/" + article.slug()
makedirs(articledir, exist_ok=True)
# Define the article filename & write the article at the filename
articlepath: str = articledir + "/" + article.filename()
with open(articlepath, "w") as f:
f.write(article.content())
# Print export location when finished exporting
style(" -> ", BO, color)
print(articlepath)
return articledir
# Print progress information about the document being exported, then copy it from
# the data directory into objectdir. A missing source file is reported on the
# terminal instead of being raised.
def write_document(
    index: int, total: int, document: Document, objectdir: str, indent_depth: int = 1
) -> None:
    color = B  # Associate documents to blue
    remaining: int = total - index
    # Every 100 documents, print how many are still to be exported
    if index % 100 == 0:
        indent(indent_depth)
        print("Exporting", end="")
        style(f" {remaining}", BO, color)
        style(f" document{s(remaining)}\n")
    # Print the name of the file with a counter
    indent(indent_depth)
    style(f"{index + 1}. {document.media} ")
    title = document.titre
    if len(title) > 0:
        highlight(title + " ", *unknown_chars(title))
    style("at ")
    print(document.fichier, end="")
    # Location of the document inside the data directory (~ expanded)
    source: str = expanduser(config.data_dir + "/" + document.fichier)
    # Copy the document from its SPIP location to the new location
    try:
        copyfile(source, objectdir + "/" + document.slug())
    except FileNotFoundError:
        style(" -> NOT FOUND!\n", BO, R)
        return
    # Copy succeeded: print the path of the written file
    style(" ->", BO, color)
    print(f" {objectdir}/{document.slug()}")
# Return true if an article field contains an unknown character
def has_unknown_chars(article: Article) -> bool:
if len(unknown_chars_context(article.texte)) > 0:
return True
@ -198,39 +77,13 @@ if __name__ == "__main__":
# Make a list containing articles where unknown characters are detected
unknown_chars_articles: list[Article] = []
# Get sections with an eventual maximum
sections = get_sections(max_sections_export)
nb_sections_export: int = len(sections)
# Write each root sections with its subtree
for section in root_sections(max_sections_export):
section.write()
print() # Break line after exporting the section
# Loop among sections & export them
for i, section in enumerate(sections):
# Get sections documents & link them
documents = section.documents()
# Write the section and store its output directory
sectiondir = write_section(i, nb_sections_export, section)
# Loop over sections related documents (images …)
for i, document in enumerate(documents):
write_document(i, len(documents), document, sectiondir)
# Loop over sections articles
articles = get_articles(section.id_rubrique, (max_articles_export))
for i, article in enumerate(articles):
# Get articles documents & link them
documents = article.documents()
# Write the article and store its output directory
articledir = write_article(i, len(articles), article, sectiondir)
# Add article to unknown_chars_articles if needed
if has_unknown_chars(article):
unknown_chars_articles.append(article)
# Decrement export limit
max_articles_export -= 1
# Loop over articles related documents (images …)
for i, document in enumerate(documents):
write_document(i, len(documents), document, articledir, 2)
# Break line when finished exporting the section
print()
print() # Break line
# Loop through each article that contains an unknown character
print() # Break line between export & unknown characters warning
# Warn about each article that contains unknown(s) character(s)
for article in unknown_chars_articles:
warn_unknown_chars(article)

View File

@ -1,13 +1,15 @@
from os import makedirs
from os.path import basename, splitext
from re import I, compile, finditer
from re import finditer
from shutil import copyfile
from typing import Any
from peewee import ModelSelect
from peewee import Model, ModelSelect
from slugify import slugify
from yaml import dump
from config import config
from converters import convert, link_document
from converters import convert, link_document, unknown_chars
from database import (
SpipArticles,
SpipAuteurs,
@ -16,9 +18,50 @@ from database import (
SpipDocumentsLiens,
SpipRubriques,
)
from styling import BLUE, BOLD, GREEN, YELLOW, highlight, indent, ss, style
class Document(SpipDocuments):
# Common base of the objects that can be written to the export directory: it
# provides the terminal messages and declares the methods subclasses must supply
class SpipWritable:
    class Meta:
        table_name: str

    # Attributes read by the methods below
    term_color: int
    texte: str
    lang: str
    titre: str

    # Name of the exported file; has to be provided by subclasses
    def filename(self, date: bool = False) -> str:
        raise NotImplementedError("Subclasses need to implement filename()")

    # Output information about file that will be exported
    def begin_message(
        self, index: int, limit: int, depth: int = 0, step: int = 100
    ) -> None:
        remaining: int = limit - index
        # Every step objects, print how many objects are left for this table
        if index % step == 0:
            indent(depth)
            print("Exporting", end="")
            style(f" {remaining}", BOLD, self.term_color)
            print(f" element{ss(remaining)} from", end="")
            style(f" {self.Meta.table_name}")
        # Counter, then title with its unknown characters highlighted
        indent(depth)
        style(f"{index + 1}. ")
        title: str = self.titre
        highlight(title, *unknown_chars(title))
        # NOTE: an "EMPTY" marker (empty texte) and the lang are not printed here

    # Write object to output destination; has to be provided by subclasses
    def write(self, export_dir: str) -> None:
        raise NotImplementedError("Subclasses need to implement write()")

    # Output information about file that was just exported
    def end_message(self, export_dir: str):
        style(" -> ", BOLD, self.term_color)
        destination = export_dir + self.filename()
        print(destination)
class Document(SpipWritable, SpipDocuments):
class Meta:
table_name: str = "spip_documents"
@ -27,17 +70,32 @@ class Document(SpipDocuments):
self.titre: str = convert(self.titre, True)
self.descriptif: str = convert(self.descriptif, True)
self.statut: str = "false" if self.statut == "publie" else "true"
# Terminal output color
self.term_color: int = BLUE
def slug(self, date: bool = False) -> str:
# Slugified base name of fichier (optionally prefixed by date_publication and a
# dash), followed by the original extension left untouched
def filename(self, date: bool = False) -> str:
    stem, extension = splitext(basename(self.fichier))
    prefix: str = self.date_publication + "-" if date else ""
    return slugify(prefix + stem) + extension
# Copy the document from config.data_dir to export_dir under its filename().
# Raises FileNotFoundError with a terminal-ready message when the copy fails
# because a file is not found (the original exception context is suppressed).
def write(self, export_dir: str) -> None:
    try:
        source: str = config.data_dir + self.fichier
        destination: str = export_dir + self.filename()
        copyfile(source, destination)
    except FileNotFoundError:
        raise FileNotFoundError(" -> NOT FOUND!\n") from None
class SpipObject:
class SpipObject(SpipWritable):
id: int
id_trad: int
date: str
maj: str
id_secteur: int
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@ -48,10 +106,16 @@ class SpipObject:
self.statut: str = "false" if self.statut == "publie" else "true"
self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
self.extra: str = convert(self.extra) # Probably unused
# Define file prefix (need to be changed later)
# Define file prefix (needs to be redefined for sections)
self.prefix = "index"
def documents(self) -> ModelSelect:
# Convert SPIP style internal links for images & other files into Markdown style.
# documents: the documents whose links have to be rewritten inside self.texte.
def link_documents(self, documents: ModelSelect) -> None:
    for d in documents:
        # Document exposes its slugified file name through filename();
        # calling slug() on it would fail (NOTE(review): confirm no slug() remains)
        self.texte = link_document(
            self.texte, d.id_document, d.titre, d.filename()
        )
# Output related documents & link them in the text by the way
def documents(self, link_documents: bool = True) -> ModelSelect:
documents = (
Document.select()
.join(
@ -60,23 +124,44 @@ class SpipObject:
)
.where(SpipDocumentsLiens.id_objet == self.id)
)
for d in documents:
self.texte = link_document(self.texte, d.id_document, d.titre, d.slug())
# Internal (articles) links
self.text = link_articles(self.texte)
if link_documents:
self.link_documents(documents)
return documents
# Slugified title, optionally prefixed by the date and a dash
def slug(self, date: bool = False) -> str:
    prefix: str = self.date + "-" if date else ""
    return slugify(prefix + self.titre)
# Rewrite, inside self.texte, the links shaped like [label](artN) or
# [label](articleN) into Markdown links pointing to the file of article N.
# An empty label is replaced by the title of the linked article.
def link_articles(self) -> None:
    pattern = r"\[(.*?)]\((?:art|article)([0-9]+)\)"
    for match in finditer(pattern, self.texte):
        label, article_id = match.group(1), match.group(2)
        article = Article.get(Article.id_article == article_id)
        title: str = label if len(label) > 0 else article.titre
        # NOTE(review): relies on Article providing slug(); confirm vs dir_slug()
        self.texte = self.texte.replace(
            match.group(0), f"[{title}]({article.slug()}/{article.filename()})"
        )
# Build the query selecting the articles whose id_rubrique is this object's id,
# ordered by descending date (no limit is applied)
def articles(self) -> ModelSelect:
    in_this_section = Article.select().where(Article.id_rubrique == self.id)
    return in_this_section.order_by(Article.date.desc())
# Slugified title of this object, optionally prefixed by its date and a dash,
# with a trailing slash unless end_slash is False
def dir_slug(self, include_date: bool = False, end_slash: bool = True) -> str:
    name: str = self.titre
    if include_date:
        name = self.date + "-" + name
    directory: str = slugify(name)
    if end_slash:
        directory += "/"
    return directory
# File name of this object: prefix, lang and configured file type joined by dots
def filename(self) -> str:
    return ".".join((self.prefix, self.lang, config.export_filetype))
def frontmatter(self) -> str:
raise NotImplementedError("Subclasses must implement 'frontmatter' method.")
def common_frontmatter(self) -> dict[str, Any]:
return {
# Get the YAML frontmatter string
def frontmatter(self, append: dict[str, Any] = {}) -> str:
meta: dict[str, Any] = {
"lang": self.lang,
"translationKey": self.id_trad,
"title": self.titre,
@ -88,9 +173,12 @@ class SpipObject:
"spip_id_secteur": self.id_secteur,
"spip_id": self.id,
}
return dump(meta | append, allow_unicode=True)
def body(self) -> str:
body: str = ""
# Get file text content
def content(self) -> str:
# Start the content with frontmatter
body: str = "---\n" + self.frontmatter() + "---"
# Add the title as a Markdown h1
if len(self.titre) > 0 and config.prepend_h1:
body += "\n\n# " + self.titre
@ -103,9 +191,10 @@ class SpipObject:
body += "\n\n# EXTRA\n\n" + self.extra
return body
# Final text: the frontmatter enclosed between two "---" markers, then the body
def content(self) -> str:
    return "".join(("---\n", self.frontmatter(), "---", self.body()))
# Write the content() of this object to the file export_dir + filename().
# export_dir: destination directory, expected to end with a path separator
# because the path is built by plain concatenation.
def write(self, export_dir: str) -> None:
    # Explicit UTF-8: the frontmatter is dumped with allow_unicode=True, so the
    # content may hold characters the locale's default encoding cannot encode
    with open(export_dir + self.filename(), "w", encoding="utf-8") as f:
        f.write(self.content())
class Article(SpipObject, SpipArticles):
@ -122,11 +211,12 @@ class Article(SpipObject, SpipArticles):
self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"
# ID
self.id = self.id_article
# Terminal output color
self.term_color = YELLOW
def frontmatter(self) -> str:
return dump(
def frontmatter(self, append: dict[str, Any] = {}) -> str:
return super().frontmatter(
{
**super().common_frontmatter(),
# Article specific
"summary": self.chapo,
"surtitle": self.surtitre,
@ -135,12 +225,11 @@ class Article(SpipObject, SpipArticles):
"authors": [author.nom for author in self.authors()],
# Debugging
"spip_id_rubrique": self.id_rubrique,
},
allow_unicode=True,
}
)
def body(self) -> str:
body: str = super().body()
def content(self) -> str:
body: str = super().content()
# If there is a caption, add the caption followed by a hr
if len(self.chapo) > 0:
body += "\n\n" + self.chapo + "\n\n***"
@ -163,29 +252,6 @@ class Article(SpipObject, SpipArticles):
)
# Build the query selecting the articles whose id_rubrique is section_id,
# ordered by descending date and capped to limit rows
def get_articles(section_id: int, limit: int = 10**6) -> ModelSelect:
    of_section = Article.select().where(Article.id_rubrique == section_id)
    newest_first = of_section.order_by(Article.date.desc())
    return newest_first.limit(limit)
# Return text where the links shaped like [label](artN) or [label](articleN)
# are rewritten into Markdown links pointing to the file of article N.
# An empty label is replaced by the title of the linked article.
def link_articles(text: str):
    pattern = r"\[(.*?)]\((?:art|article)([0-9]+)\)"
    for match in finditer(pattern, text):
        label, article_id = match.group(1), match.group(2)
        article = Article.get(Article.id_article == article_id)
        title: str = label if len(label) > 0 else article.titre
        text = text.replace(
            match.group(0), f"[{title}]({article.slug()}/{article.filename()})"
        )
    return text
class Rubrique(SpipObject, SpipRubriques):
class Meta:
table_name: str = "spip_rubriques"
@ -196,19 +262,14 @@ class Rubrique(SpipObject, SpipRubriques):
self.id = self.id_rubrique
# File prefix
self.prefix = "_index"
# Terminal output color
self.term_color = GREEN
def frontmatter(self) -> str:
return dump(
def frontmatter(self, append: dict[str, Any] = {}) -> str:
return super().frontmatter(
{
**super().common_frontmatter(),
# Debugging
"spip_id_parent": self.id_parent,
"spip_profondeur": self.profondeur,
},
allow_unicode=True,
}
)
# Build the query selecting every section, ordered by descending date and
# capped to limit rows
def get_sections(limit: int = 10**6) -> ModelSelect:
    newest_first = Rubrique.select().order_by(Rubrique.date.desc())
    return newest_first.limit(limit)

46
spip2md/styling.py Normal file
View File

@ -0,0 +1,46 @@
# pyright: strict
# Define styles (integers used as parameters of the terminal escape sequence built by style())
BOLD = 1 # Bold
ITALIC = 3 # Italic
UNDER = 4 # Underline
# Define colors (same usage as the styles above)
RED = 91 # Red
GREEN = 92 # Green
YELLOW = 93 # Yellow
BLUE = 94 # Blue
C0 = 95 # Color
C1 = 96 # Color
C2 = 96 # Color  NOTE(review): same value as C1, possibly a typo, confirm the intended code
# Print string wrapped in a terminal escape sequence, followed by end (nothing by
# default). Every value of args becomes one parameter of the sequence; bold (1)
# is used when no value is given.
def style(string: str, *args: int, end: str = "") -> None:
    esc = "\033["  # Terminal escape sequence, needs to be closed by "m"
    codes: str = ";".join(str(code) for code in args) if args else "1"
    print(esc + codes + "m" + string + esc + "0m", end=end)
# Print string followed by end (nothing by default), passing every (start, stop)
# slice given in start_stop through style() with BOLD and RED; the text between
# slices is printed as is.
def highlight(string: str, *start_stop: tuple[int, int], end: str = "") -> None:
    cursor = 0
    for span in start_stop:
        begin, finish = span
        # Plain text located before the highlighted slice
        print(string[cursor:begin], end="")
        style(string[begin:finish], BOLD, RED)
        cursor = finish
    # Remaining plain text after the last slice, then the requested ending
    print(string[cursor:], end=end)
# Plural suffix: "s" when nb is greater than 1, empty string otherwise
def ss(nb: int) -> str:
    if nb > 1:
        return "s"
    return ""
# Print the indentation string nb times, without trailing newline
# NOTE(review): the previous comment announced 2 spaces per level while the
# literal below holds a single one; confirm which is intended
def indent(nb: int = 1) -> None:
    print(" " * nb, end="")