Refactor to use Peewee object extensions instead of redefining every SPIP attribute
This commit is contained in:
parent
4d269357de
commit
13fa720562
@ -3,7 +3,7 @@ from re import I, S, compile, finditer, sub
|
||||
from typing import Optional
|
||||
|
||||
# SPIP syntax to Markdown
|
||||
spip_to_markdown = (
|
||||
SPIP_TO_MARKDOWN = (
|
||||
( # horizontal rule
|
||||
compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I),
|
||||
# r"---",
|
||||
@ -40,6 +40,14 @@ spip_to_markdown = (
|
||||
),
|
||||
r"~\1~",
|
||||
),
|
||||
( # images
|
||||
compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I),
|
||||
r"![](\1\2)",
|
||||
),
|
||||
( # documents & embeds
|
||||
compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
|
||||
r"[](\1\2)",
|
||||
),
|
||||
( # anchor
|
||||
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
|
||||
r"[\1](\2)",
|
||||
@ -100,58 +108,20 @@ spip_to_markdown = (
|
||||
),
|
||||
r"\1",
|
||||
),
|
||||
)
|
||||
|
||||
spip_to_text = (
|
||||
( # strong
|
||||
compile(r"\{\{ *(.*?) *\}\}", S | I),
|
||||
r"\1",
|
||||
),
|
||||
( # html strong
|
||||
compile(r"<strong> *(.*?) *</strong>", S | I),
|
||||
r"\1",
|
||||
),
|
||||
( # emphasis
|
||||
compile(r"\{ *(.*?) *\}", S | I),
|
||||
r"\1",
|
||||
),
|
||||
( # html emphasis
|
||||
compile(r"<i> *(.*?) *<\/i>", S | I),
|
||||
r"\1",
|
||||
),
|
||||
( # strikethrough
|
||||
compile(
|
||||
r"<del>\s*(.*?)\s*(?:(\r?\n){2,}|<\/del>)",
|
||||
S | I,
|
||||
),
|
||||
r"\1",
|
||||
),
|
||||
( # Keep only the first language in multi-language blocks
|
||||
compile(
|
||||
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
|
||||
S | I,
|
||||
),
|
||||
r"\1",
|
||||
),
|
||||
( # remove every html tag
|
||||
compile(r"<\/?.*?> *", S | I),
|
||||
r"",
|
||||
),
|
||||
( # Remove beginning with angle bracket(s)
|
||||
compile(r"^>+ +", S | I),
|
||||
r"",
|
||||
),
|
||||
( # Remove beginning with a number followed by a dot
|
||||
compile(r"^\d+\. +", S | I),
|
||||
( # WARNING remove every html tag
|
||||
compile(r"<\/?.*?>\s*", S | I),
|
||||
r"",
|
||||
),
|
||||
)
|
||||
|
||||
# HTML tag WARNING can be used to remove them all
|
||||
html_tag = compile(r"<\/?.*?> *", S | I)
|
||||
# Further cleaning for metadata texts such as titles or descriptions
|
||||
SPIP_META_BLOAT = (
|
||||
compile(r"^>+ +", S | I), # Remove beginning with angle bracket(s)
|
||||
compile(r"^\d+\. +", S | I), # Remove beginning with a number followed by a dot
|
||||
)
|
||||
|
||||
# Broken ISO encoding to proper UTF-8
|
||||
iso_to_utf = (
|
||||
ISO_TO_UTF = (
|
||||
( # Fix UTF-8 apostrophe that was interpreted as ISO 8859-1
|
||||
"’",
|
||||
r"’",
|
||||
@ -264,82 +234,71 @@ iso_to_utf = (
|
||||
)
|
||||
|
||||
# WARNING unknown broken encoding
|
||||
unknown_iso = (
|
||||
UNKNOWN_ISO = (
|
||||
r"
",
|
||||
r"∆",
|
||||
r"û",
|
||||
)
|
||||
|
||||
|
||||
# Apply spip_to_markdown conversions to a text
|
||||
def convert_body(text: Optional[str]) -> str:
|
||||
# Apply SPIP to Markdown & ISO to UTF conversions to a text, & optionally clean meta
|
||||
def convert(text: Optional[str], clean_meta: bool = False) -> str:
|
||||
if text is None:
|
||||
return ""
|
||||
for spip, markdown in spip_to_markdown:
|
||||
for spip, markdown in SPIP_TO_MARKDOWN:
|
||||
text = spip.sub(markdown, text)
|
||||
for iso, utf in iso_to_utf:
|
||||
if clean_meta:
|
||||
for bloat in SPIP_META_BLOAT:
|
||||
text = bloat.sub("", text)
|
||||
for iso, utf in ISO_TO_UTF:
|
||||
text = text.replace(iso, utf)
|
||||
return text
|
||||
|
||||
|
||||
# Apply spip_to_text conversions to a text
|
||||
def convert_meta(text: Optional[str]) -> str:
|
||||
if text is None:
|
||||
return ""
|
||||
for spip, metadata in spip_to_text:
|
||||
text = spip.sub(metadata, text)
|
||||
for iso, utf in iso_to_utf:
|
||||
text = text.replace(iso, utf)
|
||||
return text
|
||||
|
||||
|
||||
# Replace images & documents in SPIP text with Markdown links with human-readable names
|
||||
def convert_documents(text: str, documents: list[tuple[int, str, str]]) -> str:
|
||||
# Replace images & files links in Markdown with real slugs of the actually linked files
|
||||
def link_documents(text: str, documents: list[tuple[int, str, str]]) -> str:
|
||||
for id, name, slug in documents:
|
||||
# Replace images that dont have a title written in text
|
||||
text = sub(
|
||||
r"<(?:img|image)" + str(id) + r"(\|.*?)*>",
|
||||
r"\[]\((?:img|image)" + str(id) + r"(\|.*?)*\)",
|
||||
f"![{name}]({slug})",
|
||||
text,
|
||||
)
|
||||
# Replace documents that don't have a title written in text
|
||||
text = sub(
|
||||
r"<(?:doc|emb)" + str(id) + r"(\|.*?)*>",
|
||||
r"\[]\((?:doc|document|emb)" + str(id) + r"(\|.*?)*\)",
|
||||
f"[{name}]({slug})",
|
||||
text,
|
||||
)
|
||||
# Replace images that already had a title in Markdown style link
|
||||
text = sub(
|
||||
r"\[(.*?)\]\((?:doc|emb)" + str(id) + r"(\|.*?)*\)",
|
||||
r"\[(.+?)\]\((?:img|image)" + str(id) + r"(\|.*?)*\)",
|
||||
f"![\\1]({slug})",
|
||||
text,
|
||||
)
|
||||
# Replace documents that already had a title in Markdown style link
|
||||
text = sub(
|
||||
r"\[(.+?)\]\((?:doc|document|emb)" + str(id) + r"(\|.*?)*\)",
|
||||
f"[\\1]({slug})",
|
||||
text,
|
||||
)
|
||||
return text
|
||||
|
||||
|
||||
# Replace unknown chars with empty strings (delete them)
|
||||
def remove_unknown_chars(text: str) -> str:
    # Return text with every sequence listed in unknown_iso deleted.
    # str.replace returns a NEW string (str is immutable), so the result has
    # to be re-assigned; discarding it left the text untouched.
    for char in unknown_iso:
        text = text.replace(char, "")
    return text
|
||||
|
||||
|
||||
# Replace HTML tags chars with empty strings (delete them)
|
||||
def remove_tags(text: str) -> str:
    # Delete every match of the module-level html_tag pattern from text
    stripped: str = sub(html_tag, "", text)
    return stripped
|
||||
|
||||
|
||||
# Return a list of tuples giving the start and end of unknown substring in text
|
||||
def unknown_chars(text: str) -> list[tuple[int, int]]:
|
||||
positions: list[tuple[int, int]] = []
|
||||
for char in unknown_iso:
|
||||
for char in UNKNOWN_ISO:
|
||||
for match in finditer("(" + char + ")+", text):
|
||||
positions.append((match.start(), match.end()))
|
||||
return positions
|
||||
|
||||
|
||||
# Return strings with unknown chars found in text, surrounded by context_length chars
|
||||
def get_unknown_chars(text: str, context_length: int = 20) -> list[str]:
|
||||
def unknown_chars_context(text: str, context_length: int = 20) -> list[str]:
|
||||
errors: list[str] = []
|
||||
context: str = r".{0," + str(context_length) + r"}"
|
||||
for char in unknown_iso:
|
||||
for char in UNKNOWN_ISO:
|
||||
matches = finditer(
|
||||
context + r"(?=" + char + r")" + char + r".*?(?=\r?\n|$)",
|
||||
text,
|
@ -1,4 +1,3 @@
|
||||
# pyright: basic
|
||||
# type: ignore
|
||||
from peewee import (
|
||||
SQL,
|
||||
@ -15,7 +14,7 @@ from peewee import (
|
||||
TextField,
|
||||
)
|
||||
|
||||
db = MySQLDatabase(None)
|
||||
DB = MySQLDatabase(None)
|
||||
|
||||
|
||||
# class UnknownField(object):
|
||||
@ -25,7 +24,7 @@ db = MySQLDatabase(None)
|
||||
|
||||
class BaseModel(Model):
|
||||
class Meta:
|
||||
database: MySQLDatabase = db
|
||||
database: MySQLDatabase = DB
|
||||
|
||||
|
||||
class SpipArticles(BaseModel):
|
||||
|
259
spip2md/items.py
259
spip2md/items.py
@ -1,259 +0,0 @@
|
||||
# pyright: strict
|
||||
from os.path import basename, splitext
|
||||
from typing import Any, Optional
|
||||
|
||||
from slugify import slugify
|
||||
from yaml import dump
|
||||
|
||||
from converter import convert_body, convert_documents, convert_meta, remove_tags
|
||||
from database import (
|
||||
SpipArticles,
|
||||
SpipAuteurs,
|
||||
SpipAuteursLiens,
|
||||
SpipDocuments,
|
||||
SpipDocumentsLiens,
|
||||
SpipRubriques,
|
||||
)
|
||||
|
||||
EXPORTTYPE: str = "md"
|
||||
|
||||
|
||||
class Iterator:
    # Single-pass iterator over a pre-populated "items" list.
    # Subclasses must assign self.items BEFORE calling super().__init__(),
    # because the constructor reads its length.
    items: list[Any]

    def __init__(self) -> None:
        # Set the limit at the number of retrieved items
        self.LIMIT: int = len(self.items)
        # Start before the first element
        self.count: int = -1

    def __iter__(self):
        return self

    def __len__(self) -> int:
        return self.LIMIT

    def remaining(self) -> int:
        # Number of items left, counting the one most recently returned.
        # Clamped to [0, LIMIT]: count starts at -1, which used to report
        # LIMIT + 1 before the first next(), and repeated next() calls after
        # exhaustion used to drive the value negative.
        return max(0, min(self.LIMIT, self.LIMIT - self.count))

    def __next__(self) -> Any:
        self.count += 1
        if self.count >= self.LIMIT:
            # Park the cursor at the end so further calls do not drift
            self.count = self.LIMIT
            raise StopIteration
        return self.items[self.count]
|
||||
|
||||
|
||||
class Document:
    # Export-ready view of one SpipDocuments row
    def __init__(self, document: SpipDocuments) -> None:
        self.id: int = document.id_document
        self.thumbnail_id: int = document.id_vignette
        self.title: str = convert_meta(document.titre)
        self.date: str = document.date
        self.description: str = convert_meta(document.descriptif)
        self.file: str = document.fichier
        # A draft is anything that is NOT published; the previous
        # `== "publie"` comparison flagged published documents as drafts
        self.draft: bool = document.statut != "publie"
        self.creation: str = document.date
        self.publication: str = document.date_publication
        self.update: str = document.maj
        self.media: str = document.media

    # Slugified file name (extension kept as-is), optionally prefixed with
    # the publication date
    def get_slug(self, date: bool = False) -> str:
        stem, extension = splitext(basename(self.file))
        prefix: str = self.publication + "-" if date else ""
        return slugify(prefix + stem) + extension
|
||||
|
||||
|
||||
class Documents(Iterator):
    # Iterator over the documents attached to one SPIP object
    def __init__(self, object_id: int) -> None:
        # Documents are tied to objects through the SpipDocumentsLiens table
        same_document = SpipDocuments.id_document == SpipDocumentsLiens.id_document
        linked_rows = (
            SpipDocuments.select()
            .join(SpipDocumentsLiens, on=same_document)
            .where(SpipDocumentsLiens.id_objet == object_id)
        )
        # Wrap every row before the base class measures the list
        self.items: list[Document] = [Document(row) for row in linked_rows]
        super().__init__()
|
||||
|
||||
|
||||
class Item:
    # Shared export logic for SPIP articles & sections (rubriques)
    id: int  # Assigned by subclasses from their table-specific primary key

    def __init__(self, item: SpipArticles | SpipRubriques):
        self.title: str = convert_meta(item.titre)
        self.section_id: int = item.id_rubrique
        self.description: str = convert_meta(item.descriptif)
        self.text: str = convert_body(item.texte)  # Convert SPIP to Markdown
        self.publication: str = item.date
        # A draft is anything that is NOT published; the previous
        # `== "publie"` comparison flagged published items as drafts
        self.draft: bool = item.statut != "publie"
        self.sector_id: int = item.id_secteur
        self.update: str = item.maj
        self.lang: str = item.lang
        self.set_lang: bool = item.langue_choisie == "oui"  # TODO Why ?
        self.translation_key: int = item.id_trad
        self.extra: str = convert_body(item.extra)  # Probably unused

    # Slug of the title, optionally prefixed with the publication date
    def get_slug(self, date: bool = False) -> str:
        return slugify((self.publication + "-" if date else "") + self.title)

    def get_filename(self) -> str:
        return "index" + "." + self.lang + "." + EXPORTTYPE

    # YAML front matter; keys in "append" are merged over the common ones
    def get_frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
        frontmatter: dict[str, Any] = {
            "lang": self.lang,
            "translationKey": self.translation_key,
            "title": self.title,
            "publishDate": self.publication,
            "lastmod": self.update,
            "draft": self.draft,
            "description": self.description,
            # Debugging
            "spip_id": self.id,
            "spip_id_secteur": self.sector_id,
        }
        # The former `{...} | append if append is not None else {}` parsed as
        # `({...} | append) if ... else {}`, so calling without "append"
        # dumped an EMPTY mapping instead of the common keys
        if append is not None:
            frontmatter |= append
        return dump(frontmatter, allow_unicode=True)

    def get_body(self) -> str:
        body: str = ""
        # Add the title as a Markdown h1
        if len(self.title) > 0:
            body += "\n\n# " + self.title
        # If there is a text, add the text preceded by two line breaks
        if len(self.text) > 0:
            # Convert images & files links
            text: str = convert_documents(
                self.text,
                [(d.id, d.title, d.get_slug()) for d in self.get_documents()],
            )
            # Remove remaining HTML after & append to body
            body += "\n\n" + remove_tags(text)
        # Same with an "extra" section
        if len(self.extra) > 0:
            body += "\n\n# EXTRA\n\n" + self.extra
        return body

    def get_content(self) -> str:
        # Return the final article text
        return "---\n" + self.get_frontmatter() + "---" + self.get_body()

    def get_documents(self) -> Documents:
        return Documents(self.id)
|
||||
|
||||
|
||||
class Article(Item):
    # A SPIP article: the common Item fields plus article-only columns
    def __init__(self, article: SpipArticles):
        super().__init__(article)
        self.id: int = article.id_article
        self.surtitle: str = convert_meta(article.surtitre)  # Probably unused
        self.subtitle: str = convert_meta(article.soustitre)  # Probably unused
        self.caption: str = convert_body(article.chapo)  # Probably unused
        self.ps: str = convert_body(article.ps)  # Probably unused
        self.update_2: str = article.date_modif  # Probably unused duplicate of maj
        self.creation: str = article.date_redac
        self.forum: bool = article.accepter_forum == "oui"  # TODO Why ?
        self.sitename: str = article.nom_site  # Probably useless
        self.virtual: str = article.virtuel  # TODO Why ?
        self.microblog: str = article.microblog  # Probably unused
        # self.export = article.export # USELESS
        # self.views: int = article.visites # USELESS in static
        # self.referers: int = article.referers # USELESS in static
        # self.popularity: float = article.popularite # USELESS in static
        # self.version = article.id_version # USELESS

    # Query the authors linked to this article through SpipAuteursLiens
    def get_authors(self) -> list[SpipAuteurs]:
        return (
            SpipAuteurs.select()
            .join(
                SpipAuteursLiens,
                on=(SpipAuteurs.id_auteur == SpipAuteursLiens.id_auteur),
            )
            .where(SpipAuteursLiens.id_objet == self.id)
        )

    # Common front matter extended with article-specific keys, then "append"
    def get_frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
        article_meta: dict[str, Any] = {
            "surtitle": self.surtitle,
            "subtitle": self.subtitle,
            "date": self.creation,
            "authors": [author.nom for author in self.get_authors()],
            # Debugging
            "spip_id_rubrique": self.section_id,
            "spip_id_secteur": self.sector_id,
            "spip_chapo": self.caption,
        }
        # The former `{...} | append if append is not None else {}` parsed as
        # `({...} | append) if ... else {}`, so every article-specific key
        # was dropped whenever "append" was left to None
        if append is not None:
            article_meta |= append
        return super().get_frontmatter(article_meta)

    def get_body(self) -> str:
        body: str = super().get_body()
        # If there is a caption, add the caption followed by a hr
        if hasattr(self, "caption") and len(self.caption) > 0:
            body += "\n\n" + self.caption + "\n\n***"
        # PS
        if hasattr(self, "ps") and len(self.ps) > 0:
            body += "\n\n# POST-SCRIPTUM\n\n" + self.ps
        # Microblog
        if hasattr(self, "microblog") and len(self.microblog) > 0:
            body += "\n\n# MICROBLOGGING\n\n" + self.microblog
        return body
|
||||
|
||||
|
||||
class Section(Item):
    # A SPIP rubrique: the common Item fields plus section-only columns
    def __init__(self, section: SpipRubriques):
        super().__init__(section)
        self.id: int = section.id_rubrique
        self.parent_id: int = section.id_parent
        self.depth: int = section.profondeur
        self.agenda: int = section.agenda

    # Same file name as any Item, prefixed with an underscore
    def get_filename(self) -> str:
        item_filename: str = super().get_filename()
        return f"_{item_filename}"

    # Articles of this section, passing "limit" through to Articles
    def get_articles(self, limit: int = 0):
        section_articles = Articles(self.id, limit)
        return section_articles
|
||||
|
||||
|
||||
class Articles(Iterator):
    # Iterator over the articles of one section, most recent first
    def __init__(self, section_id: int, limit: int = 0):
        # Build the common query once; it is only capped for a positive limit
        query = (
            SpipArticles.select()
            .where(SpipArticles.id_rubrique == section_id)
            .order_by(SpipArticles.date.desc())
        )
        if limit > 0:
            query = query.limit(limit)
        # Wrap every row before the base class measures the list
        self.items: list[Article] = [Article(row) for row in query]
        super().__init__()
|
||||
|
||||
|
||||
class Sections(Iterator):
    # Iterator over every section, most recent first
    def __init__(self, limit: int = 0):
        # Build the common query once; it is only capped for a positive limit
        query = SpipRubriques.select().order_by(SpipRubriques.date.desc())
        if limit > 0:
            query = query.limit(limit)
        # Wrap every row before the base class measures the list
        self.items: list[Section] = [Section(row) for row in query]
        super().__init__()
|
@ -1,18 +1,19 @@
|
||||
#!python
|
||||
# pyright: strict
|
||||
from os import makedirs
|
||||
from os.path import expanduser
|
||||
from shutil import copyfile, rmtree
|
||||
from sys import argv
|
||||
|
||||
from config import config
|
||||
from converter import get_unknown_chars, unknown_chars
|
||||
from database import db
|
||||
from items import (
|
||||
from converters import unknown_chars, unknown_chars_context
|
||||
from database import DB
|
||||
from spipobjects import (
|
||||
Article,
|
||||
Document,
|
||||
Section,
|
||||
Sections,
|
||||
Rubrique,
|
||||
get_articles,
|
||||
get_documents,
|
||||
get_sections,
|
||||
)
|
||||
|
||||
|
||||
@ -64,26 +65,27 @@ def indent(nb: int = 1) -> None:
|
||||
|
||||
|
||||
# Connect to the MySQL database with Peewee ORM
|
||||
db.init(config.db, host=config.db_host, user=config.db_user, password=config.db_pass)
|
||||
db.connect()
|
||||
DB.init(config.db, host=config.db_host, user=config.db_user, password=config.db_pass)
|
||||
DB.connect()
|
||||
|
||||
|
||||
# Output information about ongoing export & write section to output destination
|
||||
def write_section(index: int, total: int, section: Section) -> str:
|
||||
def write_section(index: int, total: int, section: Rubrique) -> str:
|
||||
color = G # Associate sections to green
|
||||
# Print the name of the exported section & number of remaining sections
|
||||
style(f"{index + 1}. ", BO)
|
||||
highlight(section.title, *unknown_chars(section.title))
|
||||
style(f" {total-index-1}", BO, G)
|
||||
highlight(section.titre, *unknown_chars(section.titre))
|
||||
style(f" {total-index-1}", BO, color)
|
||||
style(f" section{s(total-index)} left")
|
||||
# Define the section’s path (directory) & create directory(ies) if needed
|
||||
sectiondir: str = config.output_dir + "/" + section.get_slug()
|
||||
sectiondir: str = config.output_dir + "/" + section.slug()
|
||||
makedirs(sectiondir, exist_ok=True)
|
||||
# Define the section filename & write the index at that filename
|
||||
sectionpath: str = sectiondir + "/" + section.get_filename()
|
||||
sectionpath: str = sectiondir + "/" + section.filename()
|
||||
with open(sectionpath, "w") as f:
|
||||
f.write(section.get_content())
|
||||
f.write(section.content())
|
||||
# Print export location when finished exporting
|
||||
style(" -> ", BO, G)
|
||||
style(" -> ", BO, color)
|
||||
print(sectionpath)
|
||||
# Return the first "limit" articles of section
|
||||
return sectiondir
|
||||
@ -91,30 +93,31 @@ def write_section(index: int, total: int, section: Section) -> str:
|
||||
|
||||
# Output information about ongoing export & write article to output destination
|
||||
def write_article(index: int, total: int, article: Article, sectiondir: str) -> str:
|
||||
color = Y # Associate articles to yellow
|
||||
# Print the remaining number of articles to export every 100 articles
|
||||
if index % 100 == 0:
|
||||
indent()
|
||||
print("Exporting", end="")
|
||||
style(f" {total-index}", BO, Y)
|
||||
style(f" {total-index}", BO, color)
|
||||
print(" SPIP", end="")
|
||||
style(f" article{s(total-index)}")
|
||||
print(" to Markdown & YAML files")
|
||||
# Print the title of the article being exported
|
||||
style(
|
||||
f" {index + 1}. "
|
||||
+ ("EMPTY " if len(article.text) < 1 else "")
|
||||
+ ("EMPTY " if len(article.texte) < 1 else "")
|
||||
+ f"{article.lang} "
|
||||
)
|
||||
highlight(article.title, *unknown_chars(article.title))
|
||||
highlight(article.titre, *unknown_chars(article.titre))
|
||||
# Define the full article path & create directory(ies) if needed
|
||||
articledir: str = sectiondir + "/" + article.get_slug()
|
||||
articledir: str = sectiondir + "/" + article.slug()
|
||||
makedirs(articledir, exist_ok=True)
|
||||
# Define the article filename & write the article at the filename
|
||||
articlepath: str = articledir + "/" + article.get_filename()
|
||||
articlepath: str = articledir + "/" + article.filename()
|
||||
with open(articlepath, "w") as f:
|
||||
f.write(article.get_content())
|
||||
f.write(article.content())
|
||||
# Print export location when finished exporting
|
||||
style(" -> ", BO, B)
|
||||
style(" -> ", BO, color)
|
||||
print(articlepath)
|
||||
return articledir
|
||||
|
||||
@ -123,34 +126,35 @@ def write_article(index: int, total: int, article: Article, sectiondir: str) ->
|
||||
def write_document(
|
||||
index: int, total: int, document: Document, objectdir: str, indent_depth: int = 1
|
||||
) -> None:
|
||||
color = B # Associate documents to blue
|
||||
if index % 100 == 0:
|
||||
indent(indent_depth)
|
||||
print("Exporting", end="")
|
||||
style(f" {total-index}", BO, B)
|
||||
style(f" {total-index}", BO, color)
|
||||
style(f" document{s(total-index)}\n")
|
||||
# Print the name of the file with a counter
|
||||
indent(indent_depth)
|
||||
style(f"{index + 1}. {document.media} ")
|
||||
if len(document.title) > 0:
|
||||
highlight(document.title + " ", *unknown_chars(document.title))
|
||||
if len(document.titre) > 0:
|
||||
highlight(document.titre + " ", *unknown_chars(document.titre))
|
||||
style("at ")
|
||||
print(document.file, end="")
|
||||
print(document.fichier, end="")
|
||||
# Define document path
|
||||
documentpath: str = expanduser(config.data_dir + "/" + document.file)
|
||||
documentpath: str = expanduser(config.data_dir + "/" + document.fichier)
|
||||
# Copy the document from it’s SPIP location to the new location
|
||||
try:
|
||||
copyfile(documentpath, objectdir + "/" + document.get_slug())
|
||||
copyfile(documentpath, objectdir + "/" + document.slug())
|
||||
except FileNotFoundError:
|
||||
style(" -> NOT FOUND!\n", BO, R)
|
||||
else:
|
||||
# Print the outputted file’s path when copied the file
|
||||
style(" ->", BO, B)
|
||||
print(f" {objectdir}/{document.get_slug()}")
|
||||
style(" ->", BO, color)
|
||||
print(f" {objectdir}/{document.slug()}")
|
||||
|
||||
|
||||
# Return true if an article field contains an unknown character
|
||||
def has_unknown_chars(article: Article) -> bool:
|
||||
if len(get_unknown_chars(article.text)) > 0:
|
||||
if len(unknown_chars_context(article.texte)) > 0:
|
||||
return True
|
||||
return False
|
||||
|
||||
@ -159,13 +163,13 @@ def has_unknown_chars(article: Article) -> bool:
|
||||
def warn_unknown_chars(article: Article) -> None:
|
||||
# Print the title of the article in which there is unknown characters
|
||||
# & the number of them
|
||||
unknown_chars_apparitions: list[str] = get_unknown_chars(article.text)
|
||||
unknown_chars_apparitions: list[str] = unknown_chars_context(article.texte)
|
||||
nb: int = len(unknown_chars_apparitions)
|
||||
s: str = "s" if nb > 1 else ""
|
||||
style(f"{nb}")
|
||||
print(f" unknown character{s} in", end="")
|
||||
style(f" {article.lang} ")
|
||||
highlight(article.title, *unknown_chars(article.title))
|
||||
highlight(article.titre, *unknown_chars(article.titre))
|
||||
print() # Break line
|
||||
# Print the context in which the unknown characters are found
|
||||
for text in unknown_chars_apparitions:
|
||||
@ -197,7 +201,7 @@ if __name__ == "__main__":
|
||||
unknown_chars_articles: list[Article] = []
|
||||
|
||||
# Get sections with an eventual maximum
|
||||
sections = Sections(max_sections_export)
|
||||
sections = get_sections(max_sections_export)
|
||||
nb_sections_export: int = len(sections)
|
||||
|
||||
# Loop among sections & export them
|
||||
@ -205,11 +209,11 @@ if __name__ == "__main__":
|
||||
# Write the section & store its articles
|
||||
sectiondir = write_section(i, nb_sections_export, section)
|
||||
# Loop over section’s related files (images …)
|
||||
documents = section.get_documents()
|
||||
documents = get_documents(section.id_rubrique)
|
||||
for i, document in enumerate(documents):
|
||||
write_document(i, len(documents), document, sectiondir)
|
||||
# Loop over section’s articles
|
||||
articles = section.get_articles(max_articles_export)
|
||||
articles = get_articles(section.id_rubrique, (max_articles_export))
|
||||
for i, article in enumerate(articles):
|
||||
articledir = write_article(i, len(articles), article, sectiondir)
|
||||
# Add article to unknown_chars_articles if needed
|
||||
@ -218,7 +222,7 @@ if __name__ == "__main__":
|
||||
# Decrement export limit
|
||||
max_articles_export -= 1
|
||||
# Loop over article’s related files (images …)
|
||||
documents = section.get_documents()
|
||||
documents = get_documents(article.id_article)
|
||||
for i, document in enumerate(documents):
|
||||
write_document(i, len(documents), document, sectiondir, 2)
|
||||
# Break line when finished exporting the section
|
||||
@ -229,4 +233,4 @@ if __name__ == "__main__":
|
||||
for article in unknown_chars_articles:
|
||||
warn_unknown_chars(article)
|
||||
|
||||
db.close() # Close the connection with the database
|
||||
DB.close() # Close the connection with the database
|
||||
|
212
spip2md/spipobjects.py
Normal file
212
spip2md/spipobjects.py
Normal file
@ -0,0 +1,212 @@
|
||||
from os.path import basename, splitext
|
||||
|
||||
from peewee import ModelSelect
|
||||
from slugify import slugify
|
||||
from yaml import dump
|
||||
|
||||
from converters import convert
|
||||
from database import (
|
||||
SpipArticles,
|
||||
SpipAuteurs,
|
||||
SpipAuteursLiens,
|
||||
SpipDocuments,
|
||||
SpipDocumentsLiens,
|
||||
SpipRubriques,
|
||||
)
|
||||
|
||||
EXPORTTYPE: str = "md"
|
||||
|
||||
# Convert images & files links
|
||||
# text: str = convert_documents(
|
||||
# self.texte,
|
||||
# [(d.id, d.titre, d.slug()) for d in self.documents()],
|
||||
# )
|
||||
|
||||
|
||||
class Document(SpipDocuments):
    # SpipDocuments row whose text fields are converted on construction
    class Meta:
        table_name: str = "spip_documents"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # NOTE(review): presumably the fields are already populated at this
        # point — confirm how Peewee builds instances for joined queries
        # Metadata fields get the extra "clean meta" pass
        self.titre: str = convert(self.titre, True)
        self.descriptif: str = convert(self.descriptif, True)
        # Stored as the strings "true"/"false": published means not a draft
        if self.statut == "publie":
            self.statut: str = "false"
        else:
            self.statut: str = "true"

    # Slugified file name (extension kept as-is), optionally prefixed with
    # the publication date
    def slug(self, date: bool = False) -> str:
        stem, extension = splitext(basename(self.fichier))
        prefix: str = self.date_publication + "-" if date else ""
        return slugify(prefix + stem) + extension
|
||||
|
||||
|
||||
class Article(SpipArticles):
    # SpipArticles row whose text fields are converted on construction,
    # with helpers to render it as a Markdown file with YAML front matter
    class Meta:
        table_name: str = "spip_articles"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # NOTE(review): presumably the fields are already populated at this
        # point — confirm how Peewee builds instances for loaded rows
        self.titre: str = convert(self.titre, True)
        self.descriptif: str = convert(self.descriptif, True)
        self.texte: str = convert(self.texte)  # Convert SPIP to Markdown
        # Draft flag as a string: published means not a draft
        self.statut: str = "false" if self.statut == "publie" else "true"
        # "oui" means the language was explicitly chosen; the mapping used to
        # be inverted compared with accepter_forum below
        self.langue_choisie: str = "true" if self.langue_choisie == "oui" else "false"
        self.extra: str = convert(self.extra)  # Probably unused
        # Article specific
        self.surtitle: str = convert(self.surtitre, True)  # Probably unused
        self.subtitle: str = convert(self.soustitre, True)  # Probably unused
        self.caption: str = convert(self.chapo)  # Probably unused
        self.ps: str = convert(self.ps)  # Probably unused
        self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"

    # Slug of the title, optionally prefixed with the date
    def slug(self, date: bool = False) -> str:
        return slugify((self.date + "-" if date else "") + self.titre)

    def filename(self) -> str:
        return "index" + "." + self.lang + "." + EXPORTTYPE

    # YAML front matter of the exported article
    def frontmatter(self) -> str:
        return dump(
            {
                "lang": self.lang,
                "translationKey": self.id_trad,
                "title": self.titre,
                "publishDate": self.date,
                "lastmod": self.maj,
                "draft": self.statut,
                "description": self.descriptif,
                # Debugging
                "spip_id": self.id_article,
                "spip_id_secteur": self.id_secteur,
                # Article specific
                "surtitle": self.surtitle,
                "subtitle": self.subtitle,
                "date": self.date_redac,
                "authors": [author.nom for author in self.authors()],
                # Debugging
                "spip_id_rubrique": self.id_rubrique,
                "spip_chapo": self.caption,
            },
            allow_unicode=True,
        )

    # Markdown body: title, text, then the optional extra sections
    def body(self) -> str:
        body: str = ""
        # Add the title as a Markdown h1
        if len(self.titre) > 0:
            body += "\n\n# " + self.titre
        # If there is a text, add the text preceded by two line breaks
        if len(self.texte) > 0:
            # Only the line breaks used to be appended, so the article text
            # itself never reached the exported file
            body += "\n\n" + self.texte
        # Same with an "extra" section
        if len(self.extra) > 0:
            body += "\n\n# EXTRA\n\n" + self.extra
        # If there is a caption, add the caption followed by a hr
        if hasattr(self, "caption") and len(self.caption) > 0:
            body += "\n\n" + self.caption + "\n\n***"
        # PS
        if hasattr(self, "ps") and len(self.ps) > 0:
            body += "\n\n# POST-SCRIPTUM\n\n" + self.ps
        # Microblog
        if hasattr(self, "microblog") and len(self.microblog) > 0:
            body += "\n\n# MICROBLOGGING\n\n" + self.microblog
        return body

    def content(self) -> str:
        # Return the final article text
        return "---\n" + self.frontmatter() + "---" + self.body()

    # Query the authors linked to this article through SpipAuteursLiens
    def authors(self) -> list[SpipAuteurs]:
        return (
            SpipAuteurs.select()
            .join(
                SpipAuteursLiens,
                on=(SpipAuteurs.id_auteur == SpipAuteursLiens.id_auteur),
            )
            .where(SpipAuteursLiens.id_objet == self.id_article)
        )
|
||||
|
||||
|
||||
class Rubrique(SpipRubriques):
    # SpipRubriques row whose text fields are converted on construction,
    # with helpers to render it as a section index file
    class Meta:
        table_name: str = "spip_rubriques"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # NOTE(review): presumably the fields are already populated at this
        # point — confirm how Peewee builds instances for loaded rows
        self.titre: str = convert(self.titre, True)
        self.descriptif: str = convert(self.descriptif, True)
        self.texte: str = convert(self.texte)  # Convert SPIP to Markdown
        # Draft flag as a string: published means not a draft
        self.statut: str = "false" if self.statut == "publie" else "true"
        # "oui" means the language was explicitly chosen; the mapping used to
        # be inverted ("false" for "oui")
        self.langue_choisie: str = "true" if self.langue_choisie == "oui" else "false"
        self.extra: str = convert(self.extra)  # Probably unused

    # Slug of the title, optionally prefixed with the date
    def slug(self, date: bool = False) -> str:
        return slugify((self.date + "-" if date else "") + self.titre)

    def filename(self) -> str:
        # Section index files carry a leading underscore, as the former
        # Section.get_filename did; plain "index" is the article file name
        return "_index" + "." + self.lang + "." + EXPORTTYPE

    # YAML front matter of the exported section
    def frontmatter(self) -> str:
        return dump(
            {
                "lang": self.lang,
                "translationKey": self.id_trad,
                "title": self.titre,
                "publishDate": self.date,
                "lastmod": self.maj,
                "draft": self.statut,
                "description": self.descriptif,
                # Debugging
                "spip_id": self.id_rubrique,
                "spip_id_secteur": self.id_secteur,
            },
            allow_unicode=True,
        )

    # Markdown body: title, text, then the optional extra section
    def body(self) -> str:
        body: str = ""
        # Add the title as a Markdown h1
        if len(self.titre) > 0:
            body += "\n\n# " + self.titre
        # If there is a text, add the text preceded by two line breaks
        if len(self.texte) > 0:
            # Only the line breaks used to be appended, so the section text
            # itself never reached the exported file
            body += "\n\n" + self.texte
        # Same with an "extra" section
        if len(self.extra) > 0:
            body += "\n\n# EXTRA\n\n" + self.extra
        return body

    def content(self) -> str:
        # Return the final section text
        return "---\n" + self.frontmatter() + "---" + self.body()
|
||||
|
||||
|
||||
# Query the DB to retrieve all sections sorted by publication date
|
||||
def get_sections(limit: int = 10**6) -> ModelSelect:
    # Most recent sections first, capped at "limit" rows
    newest_first = Rubrique.select().order_by(Rubrique.date.desc())
    return newest_first.limit(limit)
|
||||
|
||||
|
||||
# Query the DB to retrieve all articles sorted by publication date
|
||||
def get_articles(section_id: int, limit: int = 10**6) -> ModelSelect:
    # Articles of the given section, most recent first, capped at "limit" rows
    in_section = Article.select().where(Article.id_rubrique == section_id)
    newest_first = in_section.order_by(Article.date.desc())
    return newest_first.limit(limit)
|
||||
|
||||
|
||||
# Query the DB to retrieve all documents related to object of id object_id
|
||||
def get_documents(object_id: int, limit: int = 10**6) -> ModelSelect:
    # Documents are tied to objects through the SpipDocumentsLiens table
    same_document = Document.id_document == SpipDocumentsLiens.id_document
    linked = Document.select().join(SpipDocumentsLiens, on=same_document)
    # Keep only the links pointing at object_id, capped at "limit" rows
    return linked.where(SpipDocumentsLiens.id_objet == object_id).limit(limit)
|
Loading…
Reference in New Issue
Block a user