refactor to use Peewee objects extension in place of redefining every SPIP attribute

This commit is contained in:
Guilhem Fauré 2023-05-24 10:43:39 +02:00
parent 4d269357de
commit 13fa720562
5 changed files with 298 additions and 383 deletions

View File

@ -3,7 +3,7 @@ from re import I, S, compile, finditer, sub
from typing import Optional from typing import Optional
# SPIP syntax to Markdown # SPIP syntax to Markdown
spip_to_markdown = ( SPIP_TO_MARKDOWN = (
( # horizontal rule ( # horizontal rule
compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I), compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I),
# r"---", # r"---",
@ -40,6 +40,14 @@ spip_to_markdown = (
), ),
r"~\1~", r"~\1~",
), ),
( # images
compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I),
r"![](\1\2)",
),
( # documents & embeds
compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
r"[](\1\2)",
),
( # anchor ( # anchor
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I), compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
r"[\1](\2)", r"[\1](\2)",
@ -100,58 +108,20 @@ spip_to_markdown = (
), ),
r"\1", r"\1",
), ),
) ( # WARNING remove every html tag
compile(r"<\/?.*?>\s*", S | I),
spip_to_text = (
( # strong
compile(r"\{\{ *(.*?) *\}\}", S | I),
r"\1",
),
( # html strong
compile(r"<strong> *(.*?) *</strong>", S | I),
r"\1",
),
( # emphasis
compile(r"\{ *(.*?) *\}", S | I),
r"\1",
),
( # html emphasis
compile(r"<i> *(.*?) *<\/i>", S | I),
r"\1",
),
( # strikethrough
compile(
r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
S | I,
),
r"\1",
),
( # Keep only the first language in multi-language blocks
compile(
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
S | I,
),
r"\1",
),
( # remove every html tag
compile(r"<\/?.*?> *", S | I),
r"",
),
( # Remove beginning with angle bracket(s)
compile(r"^>+ +", S | I),
r"",
),
( # Remove beginning with a number followed by a dot
compile(r"^\d+\. +", S | I),
r"", r"",
), ),
) )
# HTML tag WARNING can be used to remove them all # Further cleaning for metadata texts such as titles or descriptions
html_tag = compile(r"<\/?.*?> *", S | I) SPIP_META_BLOAT = (
compile(r"^>+ +", S | I), # Remove beginning with angle bracket(s)
compile(r"^\d+\. +", S | I), # Remove beginning with a number followed by a dot
)
# Broken ISO encoding to proper UTF-8 # Broken ISO encoding to proper UTF-8
iso_to_utf = ( ISO_TO_UTF = (
( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1 ( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1
"’", "’",
r"", r"",
@ -264,82 +234,71 @@ iso_to_utf = (
) )
# WARNING unknown broken encoding # WARNING unknown broken encoding
unknown_iso = ( UNKNOWN_ISO = (
r"
", r"
",
r"∆", r"∆",
r"û", r"û",
) )
# Apply spip_to_markdown conversions to a text # Apply SPIP to Markdown & ISO to UTF conversions to a text, & eventually clean meta
def convert_body(text: Optional[str]) -> str: def convert(text: Optional[str], clean_meta: bool = False) -> str:
if text is None: if text is None:
return "" return ""
for spip, markdown in spip_to_markdown: for spip, markdown in SPIP_TO_MARKDOWN:
text = spip.sub(markdown, text) text = spip.sub(markdown, text)
for iso, utf in iso_to_utf: if clean_meta:
for bloat in SPIP_META_BLOAT:
text = bloat.sub("", text)
for iso, utf in ISO_TO_UTF:
text = text.replace(iso, utf) text = text.replace(iso, utf)
return text return text
# Apply spip_to_text conversions to a text # Replace images & files links in Markdown with real slugs of the actually linked files
def convert_meta(text: Optional[str]) -> str: def link_documents(text: str, documents: list[tuple[int, str, str]]) -> str:
if text is None:
return ""
for spip, metadata in spip_to_text:
text = spip.sub(metadata, text)
for iso, utf in iso_to_utf:
text = text.replace(iso, utf)
return text
# Replace images & documents in SPIP text with Markdown links with human-readable names
def convert_documents(text: str, documents: list[tuple[int, str, str]]) -> str:
for id, name, slug in documents: for id, name, slug in documents:
# Replace images that dont have a title written in text
text = sub( text = sub(
r"<(?:img|image)" + str(id) + r"(\|.*?)*>", r"\[]\((?:img|image)" + str(id) + r"(\|.*?)*\)",
f"![{name}]({slug})", f"![{name}]({slug})",
text, text,
) )
# Replace images that dont have a title written in text
text = sub( text = sub(
r"<(?:doc|emb)" + str(id) + r"(\|.*?)*>", r"\[]\((?:doc|document|emb)" + str(id) + r"(\|.*?)*\)",
f"[{name}]({slug})", f"[{name}]({slug})",
text, text,
) )
# Replace images that already had a title in Markdown style link
text = sub( text = sub(
r"\[(.*?)\]\((?:doc|emb)" + str(id) + r"(\|.*?)*\)", r"\[(.+?)\]\((?:img|image)" + str(id) + r"(\|.*?)*\)",
f"![\\1]({slug})",
text,
)
# Replace documents that already had a title in Markdown style link
text = sub(
r"\[(.+?)\]\((?:doc|document|emb)" + str(id) + r"(\|.*?)*\)",
f"[\\1]({slug})", f"[\\1]({slug})",
text, text,
) )
return text return text
# Replace unknown chars with empty strings (delete them)
def remove_unknown_chars(text: str) -> str:
    """Return *text* with every known-broken character sequence removed.

    Deletes each sequence listed in the module-level ``unknown_iso`` table.
    """
    for char in unknown_iso:
        # str.replace returns a new string: the result must be reassigned,
        # otherwise the call is a no-op and nothing is ever removed
        text = text.replace(char, "")
    return text
# Replace HTML tags chars with empty strings (delete them)
def remove_tags(text: str) -> str:
    """Strip every HTML tag from *text* using the module-level pattern."""
    stripped: str = html_tag.sub("", text)
    return stripped
# Return a list of tuples giving the start and end of unknown substring in text # Return a list of tuples giving the start and end of unknown substring in text
def unknown_chars(text: str) -> list[tuple[int, int]]: def unknown_chars(text: str) -> list[tuple[int, int]]:
positions: list[tuple[int, int]] = [] positions: list[tuple[int, int]] = []
for char in unknown_iso: for char in UNKNOWN_ISO:
for match in finditer("(" + char + ")+", text): for match in finditer("(" + char + ")+", text):
positions.append((match.start(), match.end())) positions.append((match.start(), match.end()))
return positions return positions
# Return strings with unknown chards found in text, surrounded by context_length chars # Return strings with unknown chards found in text, surrounded by context_length chars
def get_unknown_chars(text: str, context_length: int = 20) -> list[str]: def unknown_chars_context(text: str, context_length: int = 20) -> list[str]:
errors: list[str] = [] errors: list[str] = []
context: str = r".{0," + str(context_length) + r"}" context: str = r".{0," + str(context_length) + r"}"
for char in unknown_iso: for char in UNKNOWN_ISO:
matches = finditer( matches = finditer(
context + r"(?=" + char + r")" + char + r".*?(?=\r?\n|$)", context + r"(?=" + char + r")" + char + r".*?(?=\r?\n|$)",
text, text,

View File

@ -1,4 +1,3 @@
# pyright: basic
# type: ignore # type: ignore
from peewee import ( from peewee import (
SQL, SQL,
@ -15,7 +14,7 @@ from peewee import (
TextField, TextField,
) )
db = MySQLDatabase(None) DB = MySQLDatabase(None)
# class UnknownField(object): # class UnknownField(object):
@ -25,7 +24,7 @@ db = MySQLDatabase(None)
class BaseModel(Model): class BaseModel(Model):
class Meta: class Meta:
database: MySQLDatabase = db database: MySQLDatabase = DB
class SpipArticles(BaseModel): class SpipArticles(BaseModel):

View File

@ -1,259 +0,0 @@
# pyright: strict
from os.path import basename, splitext
from typing import Any, Optional
from slugify import slugify
from yaml import dump
from converter import convert_body, convert_documents, convert_meta, remove_tags
from database import (
SpipArticles,
SpipAuteurs,
SpipAuteursLiens,
SpipDocuments,
SpipDocumentsLiens,
SpipRubriques,
)
EXPORTTYPE: str = "md"
class Iterator:
    """Base iterator over a pre-fetched list of items.

    Subclasses must set ``self.items`` before calling ``super().__init__()``.
    """

    items: list[Any]

    def __init__(self) -> None:
        # Set the limit at the number of retrieved items
        self.LIMIT: int = len(self.items)
        # Start before the first element
        self.count: int = -1

    def __iter__(self):
        return self

    def __len__(self) -> int:
        return self.LIMIT

    def remaining(self) -> int:
        # How many items have not been yielded yet (off by one before iteration)
        return self.LIMIT - self.count

    def __next__(self) -> Any:
        self.count += 1
        if self.remaining() > 0:
            return self.items[self.count]
        raise StopIteration
class Document:
    """Wrapper around a SpipDocuments row with metadata converted to clean text."""

    def __init__(self, document: SpipDocuments) -> None:
        self.id: int = document.id_document
        self.thumbnail_id: int = document.id_vignette
        self.title: str = convert_meta(document.titre)
        self.date: str = document.date
        self.description: str = convert_meta(document.descriptif)
        self.file: str = document.fichier
        # BUGFIX: a document is a draft when its status is anything BUT "publie";
        # the previous `== "publie"` flagged every published document as a draft
        self.draft: bool = document.statut != "publie"
        self.creation: str = document.date
        self.publication: str = document.date_publication
        self.update: str = document.maj
        self.media: str = document.media

    def get_slug(self, date: bool = False) -> str:
        """Return a slugified filename, optionally prefixed with the publication date,
        keeping the original file extension untouched."""
        name_type = splitext(basename(self.file))
        return (
            slugify((self.publication + "-" if date else "") + name_type[0])
            + name_type[1]
        )
class Documents(Iterator):
    """Iterator over every document attached to a given SPIP object."""

    def __init__(self, object_id: int) -> None:
        # Query the DB to retrieve all documents related to object of id object_id
        joined = SpipDocuments.select().join(
            SpipDocumentsLiens,
            on=(SpipDocuments.id_document == SpipDocumentsLiens.id_document),
        )
        related = joined.where(SpipDocumentsLiens.id_objet == object_id)
        self.items: list[Document] = [Document(row) for row in related]
        super().__init__()
class Item:
    """Base wrapper for a SPIP article or section with fields converted to Markdown."""

    id: int

    def __init__(self, item: SpipArticles | SpipRubriques):
        self.title: str = convert_meta(item.titre)
        self.section_id: int = item.id_rubrique
        self.description: str = convert_meta(item.descriptif)
        self.text: str = convert_body(item.texte)  # Convert SPIP to Markdown
        self.publication: str = item.date
        # BUGFIX: an item is a draft when its status is anything BUT "publie";
        # the previous `== "publie"` flagged every published item as a draft
        self.draft: bool = item.statut != "publie"
        self.sector_id: int = item.id_secteur
        self.update: str = item.maj
        self.lang: str = item.lang
        self.set_lang: bool = item.langue_choisie == "oui"  # TODO Why?
        self.translation_key: int = item.id_trad
        self.extra: str = convert_body(item.extra)  # Probably unused

    def get_slug(self, date: bool = False) -> str:
        """Return a slug of the title, optionally prefixed with the publication date."""
        return slugify((self.publication + "-" if date else "") + self.title)

    def get_filename(self) -> str:
        """Return the export filename, including the language & export extension."""
        return "index" + "." + self.lang + "." + EXPORTTYPE

    def get_frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
        """Return the YAML frontmatter, merged with optional extra fields.

        BUGFIX: `{...} | append if append is not None else {}` parsed as
        `({...} | append) if append is not None else {}`, so calling with no
        argument (the common case, see get_content) dumped an EMPTY dict and
        lost every base field.
        """
        meta: dict[str, Any] = {
            "lang": self.lang,
            "translationKey": self.translation_key,
            "title": self.title,
            "publishDate": self.publication,
            "lastmod": self.update,
            "draft": self.draft,
            "description": self.description,
            # Debugging
            "spip_id": self.id,
            "spip_id_secteur": self.sector_id,
        }
        if append is not None:
            meta |= append
        return dump(meta, allow_unicode=True)

    def get_body(self) -> str:
        """Assemble the Markdown body: title, converted text & extra section."""
        body: str = ""
        # Add the title as a Markdown h1
        if len(self.title) > 0:
            body += "\n\n# " + self.title
        # If there is a text, add the text preceded by two line breaks
        if len(self.text) > 0:
            # Convert images & files links
            text: str = convert_documents(
                self.text,
                [(d.id, d.title, d.get_slug()) for d in self.get_documents()],
            )
            # Remove remaining HTML after & append to body
            body += "\n\n" + remove_tags(text)
        # Same with an "extra" section
        if len(self.extra) > 0:
            body += "\n\n# EXTRA\n\n" + self.extra
        return body

    def get_content(self) -> str:
        """Return the final file content: frontmatter delimited by --- plus body."""
        return "---\n" + self.get_frontmatter() + "---" + self.get_body()

    def get_documents(self) -> Documents:
        """Return the documents attached to this item."""
        return Documents(self.id)
class Article(Item):
    """Wrapper around a SpipArticles row with article-specific fields."""

    def __init__(self, article: SpipArticles):
        super().__init__(article)
        self.id: int = article.id_article
        self.surtitle: str = convert_meta(article.surtitre)  # Probably unused
        self.subtitle: str = convert_meta(article.soustitre)  # Probably unused
        self.caption: str = convert_body(article.chapo)  # Probably unused
        self.ps: str = convert_body(article.ps)  # Probably unused
        self.update_2: str = article.date_modif  # Probably unused duplicate of maj
        self.creation: str = article.date_redac
        self.forum: bool = article.accepter_forum == "oui"  # TODO Why?
        self.sitename: str = article.nom_site  # Probably useless
        self.virtual: str = article.virtuel  # TODO Why?
        self.microblog: str = article.microblog  # Probably unused
        # self.export = article.export  # USELESS
        # self.views: int = article.visites  # USELESS in static
        # self.referers: int = article.referers  # USELESS in static
        # self.popularity: float = article.popularite  # USELESS in static
        # self.version = article.id_version  # USELESS

    def get_authors(self) -> list[SpipAuteurs]:
        """Return the authors linked to this article via spip_auteurs_liens."""
        return (
            SpipAuteurs.select()
            .join(
                SpipAuteursLiens,
                on=(SpipAuteurs.id_auteur == SpipAuteursLiens.id_auteur),
            )
            .where(SpipAuteursLiens.id_objet == self.id)
        )

    def get_frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
        """Return the frontmatter with article-specific fields added.

        BUGFIX: `{...} | append if append is not None else {}` parsed as a
        conditional around the whole merge, so with append=None (the common
        case) every article-specific field was silently dropped.
        """
        article_meta: dict[str, Any] = {
            "surtitle": self.surtitle,
            "subtitle": self.subtitle,
            "date": self.creation,
            "authors": [author.nom for author in self.get_authors()],
            # Debugging
            "spip_id_rubrique": self.section_id,
            "spip_id_secteur": self.sector_id,
            "spip_chapo": self.caption,
        }
        if append is not None:
            article_meta |= append
        return super().get_frontmatter(article_meta)

    def get_body(self) -> str:
        """Append caption, post-scriptum & microblog sections to the base body."""
        body: str = super().get_body()
        # If there is a caption, add the caption followed by a hr
        if hasattr(self, "caption") and len(self.caption) > 0:
            body += "\n\n" + self.caption + "\n\n***"
        # PS
        if hasattr(self, "ps") and len(self.ps) > 0:
            body += "\n\n# POST-SCRIPTUM\n\n" + self.ps
        # Microblog
        if hasattr(self, "microblog") and len(self.microblog) > 0:
            body += "\n\n# MICROBLOGGING\n\n" + self.microblog
        return body
class Section(Item):
    """Wrapper around a SpipRubriques row with section-specific fields."""

    def __init__(self, section: SpipRubriques):
        super().__init__(section)
        self.id: int = section.id_rubrique
        self.parent_id: int = section.id_parent
        self.depth: int = section.profondeur
        self.agenda: int = section.agenda

    def get_filename(self) -> str:
        """Prefix the base filename with an underscore (section index file)."""
        return "_" + super().get_filename()

    def get_articles(self, limit: int = 0):
        """Return this section's articles, optionally capped at *limit*."""
        return Articles(self.id, limit)
class Articles(Iterator):
    """Iterator over a section's articles, newest first."""

    def __init__(self, section_id: int, limit: int = 0):
        # Query the DB to retrieve all articles sorted by publication date
        query = (
            SpipArticles.select()
            .where(SpipArticles.id_rubrique == section_id)
            .order_by(SpipArticles.date.desc())
        )
        # Only a strictly positive limit caps the number of returned rows
        if limit > 0:
            query = query.limit(limit)
        self.items: list[Article] = [Article(row) for row in query]
        super().__init__()
class Sections(Iterator):
    """Iterator over every section, newest first."""

    def __init__(self, limit: int = 0):
        # Query the DB to retrieve all sections sorted by publication date
        query = SpipRubriques.select().order_by(SpipRubriques.date.desc())
        # Only a strictly positive limit caps the number of returned rows
        if limit > 0:
            query = query.limit(limit)
        self.items: list[Section] = [Section(row) for row in query]
        super().__init__()

View File

@ -1,18 +1,19 @@
#!python #!python
# pyright: strict
from os import makedirs from os import makedirs
from os.path import expanduser from os.path import expanduser
from shutil import copyfile, rmtree from shutil import copyfile, rmtree
from sys import argv from sys import argv
from config import config from config import config
from converter import get_unknown_chars, unknown_chars from converters import unknown_chars, unknown_chars_context
from database import db from database import DB
from items import ( from spipobjects import (
Article, Article,
Document, Document,
Section, Rubrique,
Sections, get_articles,
get_documents,
get_sections,
) )
@ -64,26 +65,27 @@ def indent(nb: int = 1) -> None:
# Connect to the MySQL database with Peewee ORM # Connect to the MySQL database with Peewee ORM
db.init(config.db, host=config.db_host, user=config.db_user, password=config.db_pass) DB.init(config.db, host=config.db_host, user=config.db_user, password=config.db_pass)
db.connect() DB.connect()
# Output information about ongoing export & write section to output destination # Output information about ongoing export & write section to output destination
def write_section(index: int, total: int, section: Section) -> str: def write_section(index: int, total: int, section: Rubrique) -> str:
color = G # Associate sections to green
# Print the name of the exported section & number of remaining sections # Print the name of the exported section & number of remaining sections
style(f"{index + 1}. ", BO) style(f"{index + 1}. ", BO)
highlight(section.title, *unknown_chars(section.title)) highlight(section.titre, *unknown_chars(section.titre))
style(f" {total-index-1}", BO, G) style(f" {total-index-1}", BO, color)
style(f" section{s(total-index)} left") style(f" section{s(total-index)} left")
# Define the sections path (directory) & create directory(ies) if needed # Define the sections path (directory) & create directory(ies) if needed
sectiondir: str = config.output_dir + "/" + section.get_slug() sectiondir: str = config.output_dir + "/" + section.slug()
makedirs(sectiondir, exist_ok=True) makedirs(sectiondir, exist_ok=True)
# Define the section filename & write the index at that filename # Define the section filename & write the index at that filename
sectionpath: str = sectiondir + "/" + section.get_filename() sectionpath: str = sectiondir + "/" + section.filename()
with open(sectionpath, "w") as f: with open(sectionpath, "w") as f:
f.write(section.get_content()) f.write(section.content())
# Print export location when finished exporting # Print export location when finished exporting
style(" -> ", BO, G) style(" -> ", BO, color)
print(sectionpath) print(sectionpath)
# Return the first "limit" articles of section # Return the first "limit" articles of section
return sectiondir return sectiondir
@ -91,30 +93,31 @@ def write_section(index: int, total: int, section: Section) -> str:
# Output information about ongoing export & write article to output destination # Output information about ongoing export & write article to output destination
def write_article(index: int, total: int, article: Article, sectiondir: str) -> str: def write_article(index: int, total: int, article: Article, sectiondir: str) -> str:
color = Y # Associate articles to yellow
# Print the remaining number of articles to export every 100 articles # Print the remaining number of articles to export every 100 articles
if index % 100 == 0: if index % 100 == 0:
indent() indent()
print("Exporting", end="") print("Exporting", end="")
style(f" {total-index}", BO, Y) style(f" {total-index}", BO, color)
print(" SPIP", end="") print(" SPIP", end="")
style(f" article{s(total-index)}") style(f" article{s(total-index)}")
print(" to Markdown & YAML files") print(" to Markdown & YAML files")
# Print the title of the article being exported # Print the title of the article being exported
style( style(
f" {index + 1}. " f" {index + 1}. "
+ ("EMPTY " if len(article.text) < 1 else "") + ("EMPTY " if len(article.texte) < 1 else "")
+ f"{article.lang} " + f"{article.lang} "
) )
highlight(article.title, *unknown_chars(article.title)) highlight(article.titre, *unknown_chars(article.titre))
# Define the full article path & create directory(ies) if needed # Define the full article path & create directory(ies) if needed
articledir: str = sectiondir + "/" + article.get_slug() articledir: str = sectiondir + "/" + article.slug()
makedirs(articledir, exist_ok=True) makedirs(articledir, exist_ok=True)
# Define the article filename & write the article at the filename # Define the article filename & write the article at the filename
articlepath: str = articledir + "/" + article.get_filename() articlepath: str = articledir + "/" + article.filename()
with open(articlepath, "w") as f: with open(articlepath, "w") as f:
f.write(article.get_content()) f.write(article.content())
# Print export location when finished exporting # Print export location when finished exporting
style(" -> ", BO, B) style(" -> ", BO, color)
print(articlepath) print(articlepath)
return articledir return articledir
@ -123,34 +126,35 @@ def write_article(index: int, total: int, article: Article, sectiondir: str) ->
def write_document( def write_document(
index: int, total: int, document: Document, objectdir: str, indent_depth: int = 1 index: int, total: int, document: Document, objectdir: str, indent_depth: int = 1
) -> None: ) -> None:
color = B # Associate documents to blue
if index % 100 == 0: if index % 100 == 0:
indent(indent_depth) indent(indent_depth)
print("Exporting", end="") print("Exporting", end="")
style(f" {total-index}", BO, B) style(f" {total-index}", BO, color)
style(f" document{s(total-index)}\n") style(f" document{s(total-index)}\n")
# Print the name of the file with a counter # Print the name of the file with a counter
indent(indent_depth) indent(indent_depth)
style(f"{index + 1}. {document.media} ") style(f"{index + 1}. {document.media} ")
if len(document.title) > 0: if len(document.titre) > 0:
highlight(document.title + " ", *unknown_chars(document.title)) highlight(document.titre + " ", *unknown_chars(document.titre))
style("at ") style("at ")
print(document.file, end="") print(document.fichier, end="")
# Define document path # Define document path
documentpath: str = expanduser(config.data_dir + "/" + document.file) documentpath: str = expanduser(config.data_dir + "/" + document.fichier)
# Copy the document from its SPIP location to the new location # Copy the document from its SPIP location to the new location
try: try:
copyfile(documentpath, objectdir + "/" + document.get_slug()) copyfile(documentpath, objectdir + "/" + document.slug())
except FileNotFoundError: except FileNotFoundError:
style(" -> NOT FOUND!\n", BO, R) style(" -> NOT FOUND!\n", BO, R)
else: else:
# Print the outputted files path when copied the file # Print the outputted files path when copied the file
style(" ->", BO, B) style(" ->", BO, color)
print(f" {objectdir}/{document.get_slug()}") print(f" {objectdir}/{document.slug()}")
# Return true if an article field contains an unknown character # Return true if an article field contains an unknown character
def has_unknown_chars(article: Article) -> bool: def has_unknown_chars(article: Article) -> bool:
if len(get_unknown_chars(article.text)) > 0: if len(unknown_chars_context(article.texte)) > 0:
return True return True
return False return False
@ -159,13 +163,13 @@ def has_unknown_chars(article: Article) -> bool:
def warn_unknown_chars(article: Article) -> None: def warn_unknown_chars(article: Article) -> None:
# Print the title of the article in which there is unknown characters # Print the title of the article in which there is unknown characters
# & the number of them # & the number of them
unknown_chars_apparitions: list[str] = get_unknown_chars(article.text) unknown_chars_apparitions: list[str] = unknown_chars_context(article.texte)
nb: int = len(unknown_chars_apparitions) nb: int = len(unknown_chars_apparitions)
s: str = "s" if nb > 1 else "" s: str = "s" if nb > 1 else ""
style(f"{nb}") style(f"{nb}")
print(f" unknown character{s} in", end="") print(f" unknown character{s} in", end="")
style(f" {article.lang} ") style(f" {article.lang} ")
highlight(article.title, *unknown_chars(article.title)) highlight(article.titre, *unknown_chars(article.titre))
print() # Break line print() # Break line
# Print the context in which the unknown characters are found # Print the context in which the unknown characters are found
for text in unknown_chars_apparitions: for text in unknown_chars_apparitions:
@ -197,7 +201,7 @@ if __name__ == "__main__":
unknown_chars_articles: list[Article] = [] unknown_chars_articles: list[Article] = []
# Get sections with an eventual maximum # Get sections with an eventual maximum
sections = Sections(max_sections_export) sections = get_sections(max_sections_export)
nb_sections_export: int = len(sections) nb_sections_export: int = len(sections)
# Loop among sections & export them # Loop among sections & export them
@ -205,11 +209,11 @@ if __name__ == "__main__":
# Write the section & store its articles # Write the section & store its articles
sectiondir = write_section(i, nb_sections_export, section) sectiondir = write_section(i, nb_sections_export, section)
# Loop over sections related files (images …) # Loop over sections related files (images …)
documents = section.get_documents() documents = get_documents(section.id_rubrique)
for i, document in enumerate(documents): for i, document in enumerate(documents):
write_document(i, len(documents), document, sectiondir) write_document(i, len(documents), document, sectiondir)
# Loop over sections articles # Loop over sections articles
articles = section.get_articles(max_articles_export) articles = get_articles(section.id_rubrique, (max_articles_export))
for i, article in enumerate(articles): for i, article in enumerate(articles):
articledir = write_article(i, len(articles), article, sectiondir) articledir = write_article(i, len(articles), article, sectiondir)
# Add article to unknown_chars_articles if needed # Add article to unknown_chars_articles if needed
@ -218,7 +222,7 @@ if __name__ == "__main__":
# Decrement export limit # Decrement export limit
max_articles_export -= 1 max_articles_export -= 1
# Loop over articles related files (images …) # Loop over articles related files (images …)
documents = section.get_documents() documents = get_documents(article.id_article)
for i, document in enumerate(documents): for i, document in enumerate(documents):
write_document(i, len(documents), document, sectiondir, 2) write_document(i, len(documents), document, sectiondir, 2)
# Break line when finished exporting the section # Break line when finished exporting the section
@ -229,4 +233,4 @@ if __name__ == "__main__":
for article in unknown_chars_articles: for article in unknown_chars_articles:
warn_unknown_chars(article) warn_unknown_chars(article)
db.close() # Close the connection with the database DB.close() # Close the connection with the database

212
spip2md/spipobjects.py Normal file
View File

@ -0,0 +1,212 @@
from os.path import basename, splitext
from peewee import ModelSelect
from slugify import slugify
from yaml import dump
from converters import convert
from database import (
SpipArticles,
SpipAuteurs,
SpipAuteursLiens,
SpipDocuments,
SpipDocumentsLiens,
SpipRubriques,
)
EXPORTTYPE: str = "md"
# Convert images & files links
# text: str = convert_documents(
# self.texte,
# [(d.id, d.titre, d.slug()) for d in self.documents()],
# )
class Document(SpipDocuments):
    """Peewee model for SPIP documents, converting metadata at instantiation."""

    class Meta:
        table_name: str = "spip_documents"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Clean SPIP markup & broken encoding in metadata fields
        self.titre: str = convert(self.titre, True)
        self.descriptif: str = convert(self.descriptif, True)
        # Map SPIP "publie" status onto a draft flag string: published -> "false"
        self.statut: str = "false" if self.statut == "publie" else "true"

    def slug(self, date: bool = False) -> str:
        """Return a slugified filename keeping the original extension."""
        stem, extension = splitext(basename(self.fichier))
        prefix: str = self.date_publication + "-" if date else ""
        return slugify(prefix + stem) + extension
class Article(SpipArticles):
    """Peewee model for SPIP articles, converting SPIP fields at instantiation."""

    class Meta:
        table_name: str = "spip_articles"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.titre: str = convert(self.titre, True)
        self.descriptif: str = convert(self.descriptif, True)
        self.texte: str = convert(self.texte)  # Convert SPIP to Markdown
        # Map SPIP "publie" status onto a draft flag string: published -> "false"
        self.statut: str = "false" if self.statut == "publie" else "true"
        self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
        self.extra: str = convert(self.extra)  # Probably unused
        # Article specific
        self.surtitle: str = convert(self.surtitre, True)  # Probably unused
        self.subtitle: str = convert(self.soustitre, True)  # Probably unused
        self.caption: str = convert(self.chapo)  # Probably unused
        self.ps: str = convert(self.ps)  # Probably unused
        self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"

    def slug(self, date: bool = False) -> str:
        """Return a slug of the title, optionally prefixed with the date."""
        return slugify((self.date + "-" if date else "") + self.titre)

    def filename(self) -> str:
        """Return the export filename, including the language & extension."""
        return "index" + "." + self.lang + "." + EXPORTTYPE

    def frontmatter(self) -> str:
        """Return the article's YAML frontmatter as a string."""
        return dump(
            {
                "lang": self.lang,
                "translationKey": self.id_trad,
                "title": self.titre,
                "publishDate": self.date,
                "lastmod": self.maj,
                "draft": self.statut,
                "description": self.descriptif,
                # Debugging
                "spip_id": self.id_article,
                "spip_id_secteur": self.id_secteur,
                # Article specific
                "surtitle": self.surtitle,
                "subtitle": self.subtitle,
                "date": self.date_redac,
                "authors": [author.nom for author in self.authors()],
                # Debugging
                "spip_id_rubrique": self.id_rubrique,
                "spip_chapo": self.caption,
            },
            allow_unicode=True,
        )

    def body(self) -> str:
        """Assemble the Markdown body: title, text, extra, caption, PS, microblog."""
        body: str = ""
        # Add the title as a Markdown h1
        if len(self.titre) > 0:
            body += "\n\n# " + self.titre
        # If there is a text, add it preceded by two line breaks
        if len(self.texte) > 0:
            # BUGFIX: self.texte was converted in __init__ & length-checked here,
            # but only "\n\n" was appended — the article text itself was lost
            body += "\n\n" + self.texte
        # Same with an "extra" section
        if len(self.extra) > 0:
            body += "\n\n# EXTRA\n\n" + self.extra
        # If there is a caption, add the caption followed by a hr
        if hasattr(self, "caption") and len(self.caption) > 0:
            body += "\n\n" + self.caption + "\n\n***"
        # PS
        if hasattr(self, "ps") and len(self.ps) > 0:
            body += "\n\n# POST-SCRIPTUM\n\n" + self.ps
        # Microblog
        if hasattr(self, "microblog") and len(self.microblog) > 0:
            body += "\n\n# MICROBLOGGING\n\n" + self.microblog
        return body

    def content(self) -> str:
        """Return the final file content: frontmatter delimited by --- plus body."""
        return "---\n" + self.frontmatter() + "---" + self.body()

    def authors(self) -> list[SpipAuteurs]:
        """Return the authors linked to this article via spip_auteurs_liens."""
        return (
            SpipAuteurs.select()
            .join(
                SpipAuteursLiens,
                on=(SpipAuteurs.id_auteur == SpipAuteursLiens.id_auteur),
            )
            .where(SpipAuteursLiens.id_objet == self.id_article)
        )
class Rubrique(SpipRubriques):
    """Peewee model for SPIP sections (rubriques), converting fields at instantiation."""

    class Meta:
        table_name: str = "spip_rubriques"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.titre: str = convert(self.titre, True)
        self.descriptif: str = convert(self.descriptif, True)
        self.texte: str = convert(self.texte)  # Convert SPIP to Markdown
        # Map SPIP "publie" status onto a draft flag string: published -> "false"
        self.statut: str = "false" if self.statut == "publie" else "true"
        self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
        self.extra: str = convert(self.extra)  # Probably unused

    def slug(self, date: bool = False) -> str:
        """Return a slug of the title, optionally prefixed with the date."""
        return slugify((self.date + "-" if date else "") + self.titre)

    def filename(self) -> str:
        """Return the export filename, including the language & extension."""
        return "index" + "." + self.lang + "." + EXPORTTYPE

    def frontmatter(self) -> str:
        """Return the section's YAML frontmatter as a string."""
        return dump(
            {
                "lang": self.lang,
                "translationKey": self.id_trad,
                "title": self.titre,
                "publishDate": self.date,
                "lastmod": self.maj,
                "draft": self.statut,
                "description": self.descriptif,
                # Debugging
                "spip_id": self.id_rubrique,
                "spip_id_secteur": self.id_secteur,
            },
            allow_unicode=True,
        )

    def body(self) -> str:
        """Assemble the Markdown body: title, text & extra section."""
        body: str = ""
        # Add the title as a Markdown h1
        if len(self.titre) > 0:
            body += "\n\n# " + self.titre
        # If there is a text, add it preceded by two line breaks
        if len(self.texte) > 0:
            # BUGFIX: self.texte was converted in __init__ & length-checked here,
            # but only "\n\n" was appended — the section text itself was lost
            body += "\n\n" + self.texte
        # Same with an "extra" section
        if len(self.extra) > 0:
            body += "\n\n# EXTRA\n\n" + self.extra
        return body

    def content(self) -> str:
        """Return the final file content: frontmatter delimited by --- plus body."""
        return "---\n" + self.frontmatter() + "---" + self.body()
# Query the DB to retrieve all sections sorted by publication date
def get_sections(limit: int = 10**6) -> ModelSelect:
    """Return every section, newest first, capped at *limit* rows."""
    newest_first = Rubrique.select().order_by(Rubrique.date.desc())
    return newest_first.limit(limit)
# Query the DB to retrieve all articles sorted by publication date
def get_articles(section_id: int, limit: int = 10**6) -> ModelSelect:
    """Return the articles of section *section_id*, newest first, capped at *limit*."""
    in_section = Article.select().where(Article.id_rubrique == section_id)
    return in_section.order_by(Article.date.desc()).limit(limit)
# Query the DB to retrieve all documents related to object of id object_id
def get_documents(object_id: int, limit: int = 10**6) -> ModelSelect:
    """Return the documents linked to object *object_id*, capped at *limit* rows."""
    joined = Document.select().join(
        SpipDocumentsLiens,
        on=(Document.id_document == SpipDocumentsLiens.id_document),
    )
    return joined.where(SpipDocumentsLiens.id_objet == object_id).limit(limit)