separation between classes, functions & regex mappings
This commit is contained in:
parent
5e7740a414
commit
32738a9269
@ -9,11 +9,7 @@ from peewee import ModelSelect
|
|||||||
|
|
||||||
from spip2md.config import CFG
|
from spip2md.config import CFG
|
||||||
from spip2md.database import DB
|
from spip2md.database import DB
|
||||||
from spip2md.regexmap import unknown_chars, unknown_chars_context
|
from spip2md.spipobjects import Rubrique
|
||||||
from spip2md.spipobjects import (
|
|
||||||
Article,
|
|
||||||
Rubrique,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Define styles
|
# Define styles
|
||||||
BOLD = 1 # Bold
|
BOLD = 1 # Bold
|
||||||
@ -60,13 +56,7 @@ def root_sections(limit: int = 10**3) -> ModelSelect:
|
|||||||
.limit(limit)
|
.limit(limit)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
r"""
|
||||||
def has_unknown_chars(article: Article) -> bool:
|
|
||||||
if len(unknown_chars_context(article.texte)) > 0:
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
# Print the detected unknown chars in article in their context but highlighted
|
# Print the detected unknown chars in article in their context but highlighted
|
||||||
def warn_unknown_chars(article: Article) -> None:
|
def warn_unknown_chars(article: Article) -> None:
|
||||||
# Print the title of the article in which there are unknown characters
|
# Print the title of the article in which there are unknown characters
|
||||||
@ -85,6 +75,7 @@ def warn_unknown_chars(article: Article) -> None:
|
|||||||
highlight(text, *unknown_chars(text))
|
highlight(text, *unknown_chars(text))
|
||||||
style(" … \n")
|
style(" … \n")
|
||||||
print() # Break line
|
print() # Break line
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
# Print one root section list output correctly
|
# Print one root section list output correctly
|
||||||
@ -106,16 +97,16 @@ DB.connect()
|
|||||||
def main(*argv):
|
def main(*argv):
|
||||||
if len(argv) == 0:
|
if len(argv) == 0:
|
||||||
argv = sys.argv
|
argv = sys.argv
|
||||||
# Define max nb of articles to export based on first CLI argument
|
# Define max nb of sections to export based on first CLI argument TODO
|
||||||
if len(argv) >= 2:
|
if len(argv) >= 2:
|
||||||
articles_export = int(argv[1])
|
sections_export = int(argv[1])
|
||||||
else:
|
|
||||||
articles_export = CFG.max_articles_export
|
|
||||||
# Define max nb of sections to export based on second CLI argument
|
|
||||||
if len(argv) >= 3:
|
|
||||||
sections_export = int(argv[2])
|
|
||||||
else:
|
else:
|
||||||
sections_export = CFG.max_sections_export
|
sections_export = CFG.max_sections_export
|
||||||
|
# Define max nb of articles to export based on second CLI argument TODO
|
||||||
|
# if len(argv) >= 3:
|
||||||
|
# articles_export = int(argv[2])
|
||||||
|
# else:
|
||||||
|
# articles_export = CFG.max_articles_export
|
||||||
|
|
||||||
# Clear the output dir & create a new
|
# Clear the output dir & create a new
|
||||||
if CFG.clear_output:
|
if CFG.clear_output:
|
||||||
|
@ -1,10 +1,9 @@
|
|||||||
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
|
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
|
||||||
# pyright: strict
|
# pyright: strict
|
||||||
from re import I, S, compile, finditer, sub
|
from re import I, S, compile
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
# SPIP syntax to Markdown
|
# ((SPIP syntax, Replacement Markdown syntax), …)
|
||||||
SPIP_TO_MARKDOWN = (
|
SPIP_MARKDOWN = (
|
||||||
( # horizontal rule
|
( # horizontal rule
|
||||||
compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I),
|
compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I),
|
||||||
# r"---",
|
# r"---",
|
||||||
@ -43,15 +42,15 @@ SPIP_TO_MARKDOWN = (
|
|||||||
),
|
),
|
||||||
( # images
|
( # images
|
||||||
compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I),
|
compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I),
|
||||||
r"![](\1\2)",
|
r"![](\2)", # Needs to be further processed to replace ID with filename
|
||||||
),
|
),
|
||||||
( # documents & embeds
|
( # documents & embeds
|
||||||
compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
|
compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
|
||||||
r"[](\1\2)",
|
r"[](\2)", # Needs to be further processed to replace ID with filename
|
||||||
),
|
),
|
||||||
( # internal links
|
( # internal links
|
||||||
compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I),
|
compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I),
|
||||||
r"[](\1\2)",
|
r"[](\2)", # Needs to be further processed to replace ID with filename
|
||||||
),
|
),
|
||||||
( # anchor
|
( # anchor
|
||||||
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
|
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
|
||||||
@ -106,20 +105,31 @@ SPIP_TO_MARKDOWN = (
|
|||||||
),
|
),
|
||||||
"```\n\\1\n\n```",
|
"```\n\\1\n\n```",
|
||||||
),
|
),
|
||||||
( # WARNING remove every html tag
|
|
||||||
compile(r"<\/?.*?>\s*", S | I),
|
|
||||||
r"",
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Further cleaning for metadata texts such as titles or descriptions
|
# Match against documents ID found in links, ID can be inserted with .format()
|
||||||
SPIP_META_BLOAT = (
|
# Name and path can be further replaced with .format()
|
||||||
|
DOCUMENT_LINK = r"(!)?\[(.*?)\]\(({})\)"
|
||||||
|
DOCUMENT_LINK_REPL = r"\1[\2{}]({})"
|
||||||
|
|
||||||
|
# Multi language block, capture groups: (lang, text, lang, text, …)
|
||||||
|
MULTILANG = compile(
|
||||||
|
r"<multi>(?:\s*\[(.{2,6})\]\s*(.*?)\s*)+<\/multi>",
|
||||||
|
S | I,
|
||||||
|
)
|
||||||
|
|
||||||
|
# WARNING probably useless text in metadata fields, to be removed
|
||||||
|
BLOAT = (
|
||||||
compile(r"^>+ +", S | I), # Remove beginning with angle bracket(s)
|
compile(r"^>+ +", S | I), # Remove beginning with angle bracket(s)
|
||||||
compile(r"^\d+\. +", S | I), # Remove beginning with a number followed by a dot
|
compile(r"^\d+\. +", S | I), # Remove beginning with a number followed by a dot
|
||||||
)
|
)
|
||||||
|
|
||||||
# Broken ISO encoding to proper UTF-8
|
# Matches against every HTML tag
|
||||||
ISO_TO_UTF = (
|
HTMLTAG = compile(r"<\/?.*?>\s*", S | I)
|
||||||
|
|
||||||
|
|
||||||
|
# ((Broken ISO 8859-1 encoding, Proper UTF equivalent encoding), …)
|
||||||
|
ISO_UTF = (
|
||||||
( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1
|
( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1
|
||||||
"’",
|
"’",
|
||||||
r"’",
|
r"’",
|
||||||
@ -224,7 +234,7 @@ ISO_TO_UTF = (
|
|||||||
"iÌ\u0081",
|
"iÌ\u0081",
|
||||||
r"í",
|
r"í",
|
||||||
),
|
),
|
||||||
# WARNING not sure
|
# WARNING not sure below
|
||||||
( # Fix UTF-8 é that was interpreted as ISO 8859-1
|
( # Fix UTF-8 é that was interpreted as ISO 8859-1
|
||||||
"eÌ ",
|
"eÌ ",
|
||||||
r"é",
|
r"é",
|
||||||
@ -239,62 +249,22 @@ ISO_TO_UTF = (
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
# WARNING unknown broken encoding
|
# WARNING broken ISO 8859-1 encoding whose UTF equivalent I don’t know
|
||||||
UNKNOWN_ISO = (
|
UNKNOWN_ISO = (
|
||||||
r"
",
|
"
",
|
||||||
r"∆",
|
"∆",
|
||||||
)
|
|
||||||
|
|
||||||
# Multi language block, capture the first
|
|
||||||
MULTILINGUAL = compile(
|
|
||||||
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
|
|
||||||
S | I,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# Apply SPIP to Markdown & ISO to UTF conversions to a text, & eventually clean meta
|
# Special elements in terminal output to surround
|
||||||
def convert(text: Optional[str], clean_meta: bool = False) -> str:
|
SPECIAL_OUTPUT = (
|
||||||
if text is None:
|
(compile(r"^([0-9]+?\.)(?= )"), r"{}\1{}"), # Counter
|
||||||
return ""
|
(compile(r"(?<= )->(?= )"), r"{}->{}"), # Arrow
|
||||||
for spip, markdown in SPIP_TO_MARKDOWN:
|
(compile(r"(?<=^Exporting )([0-9]+?)(?= )"), r"{}\1{}"), # Total
|
||||||
text = spip.sub(markdown, text)
|
)
|
||||||
if clean_meta:
|
|
||||||
for bloat in SPIP_META_BLOAT:
|
|
||||||
text = bloat.sub("", text)
|
|
||||||
for iso, utf in ISO_TO_UTF:
|
|
||||||
text = text.replace(iso, utf)
|
|
||||||
return text
|
|
||||||
|
|
||||||
|
|
||||||
# Replace images & files links in Markdown with real slugs of the actually linked files
|
|
||||||
def link_document(text: str, id: int, name: str, slug: str) -> str:
|
|
||||||
# Replace images that dont have a title written in text
|
|
||||||
text = sub(
|
|
||||||
r"!\[]\((?:img|image)" + str(id) + r"(\|.*?)*\)",
|
|
||||||
f"![{name}]({slug})",
|
|
||||||
text,
|
|
||||||
)
|
|
||||||
# Replace images that dont have a title written in text
|
|
||||||
text = sub(
|
|
||||||
r"\[]\((?:doc|document|emb)" + str(id) + r"(\|.*?)*\)",
|
|
||||||
f"[{name}]({slug})",
|
|
||||||
text,
|
|
||||||
)
|
|
||||||
# Replace images that already had a title in Markdown style link
|
|
||||||
text = sub(
|
|
||||||
r"!\[(.+?)\]\((?:img|image)" + str(id) + r"(\|.*?)*\)",
|
|
||||||
f"![\\1]({slug})",
|
|
||||||
text,
|
|
||||||
)
|
|
||||||
# Replace documents that already had a title in Markdown style link
|
|
||||||
text = sub(
|
|
||||||
r"\[(.+?)\]\((?:doc|document|emb)" + str(id) + r"(\|.*?)*\)",
|
|
||||||
f"[\\1]({slug})",
|
|
||||||
text,
|
|
||||||
)
|
|
||||||
return text
|
|
||||||
|
|
||||||
|
|
||||||
|
r"""
|
||||||
# Return a list of tuples giving the start and end of unknown substring in text
|
# Return a list of tuples giving the start and end of unknown substring in text
|
||||||
def unknown_chars(text: str) -> list[tuple[int, int]]:
|
def unknown_chars(text: str) -> list[tuple[int, int]]:
|
||||||
positions: list[tuple[int, int]] = []
|
positions: list[tuple[int, int]] = []
|
||||||
@ -303,7 +273,6 @@ def unknown_chars(text: str) -> list[tuple[int, int]]:
|
|||||||
positions.append((match.start(), match.end()))
|
positions.append((match.start(), match.end()))
|
||||||
return positions
|
return positions
|
||||||
|
|
||||||
|
|
||||||
# Return strings with unknown chars found in text, surrounded by context_length chars
|
# Return strings with unknown chars found in text, surrounded by context_length chars
|
||||||
def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
|
def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
@ -316,3 +285,4 @@ def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
|
|||||||
for match in matches:
|
for match in matches:
|
||||||
errors.append(match.group())
|
errors.append(match.group())
|
||||||
return errors
|
return errors
|
||||||
|
"""
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
|
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
|
||||||
from os import makedirs
|
from os import makedirs
|
||||||
from os.path import basename, splitext
|
from os.path import basename, splitext
|
||||||
from re import finditer
|
from re import finditer, sub
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
|
|
||||||
@ -18,7 +18,16 @@ from spip2md.database import (
|
|||||||
SpipDocumentsLiens,
|
SpipDocumentsLiens,
|
||||||
SpipRubriques,
|
SpipRubriques,
|
||||||
)
|
)
|
||||||
from spip2md.regexmap import convert, link_document, unknown_chars
|
from spip2md.regexmap import (
|
||||||
|
BLOAT,
|
||||||
|
DOCUMENT_LINK,
|
||||||
|
DOCUMENT_LINK_REPL,
|
||||||
|
HTMLTAG,
|
||||||
|
ISO_UTF,
|
||||||
|
MULTILANG,
|
||||||
|
SPIP_MARKDOWN,
|
||||||
|
UNKNOWN_ISO,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class SpipWritable:
|
class SpipWritable:
|
||||||
@ -26,6 +35,7 @@ class SpipWritable:
|
|||||||
texte: str
|
texte: str
|
||||||
lang: str
|
lang: str
|
||||||
titre: str
|
titre: str
|
||||||
|
profondeur: int
|
||||||
|
|
||||||
def filename(self, date: bool = False) -> str:
|
def filename(self, date: bool = False) -> str:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
@ -49,6 +59,19 @@ class SpipWritable:
|
|||||||
output[-1] += "MISSING NAME"
|
output[-1] += "MISSING NAME"
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
# Apply different mappings to text fields, like SPIP to Markdown or encoding
|
||||||
|
def convert_attrs(self, *attrs: str) -> None:
|
||||||
|
attrs += "titre", "descriptif"
|
||||||
|
for attr in attrs:
|
||||||
|
a = getattr(self, attr)
|
||||||
|
if len(a) > 0:
|
||||||
|
for spip, markdown in SPIP_MARKDOWN:
|
||||||
|
setattr(self, attr, spip.sub(markdown, a))
|
||||||
|
for bloat in BLOAT:
|
||||||
|
setattr(self, attr, bloat.sub("", a))
|
||||||
|
for iso, utf in ISO_UTF:
|
||||||
|
setattr(self, attr, a.replace(iso, utf))
|
||||||
|
|
||||||
# Write object to output destination
|
# Write object to output destination
|
||||||
def write(self, parent_dir: str) -> str:
|
def write(self, parent_dir: str) -> str:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
@ -69,8 +92,6 @@ class Document(SpipWritable, SpipDocuments):
|
|||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
self.titre: str = convert(self.titre, True)
|
|
||||||
self.descriptif: str = convert(self.descriptif, True)
|
|
||||||
self.statut: str = "false" if self.statut == "publie" else "true"
|
self.statut: str = "false" if self.statut == "publie" else "true"
|
||||||
|
|
||||||
# Get slugified name of this file
|
# Get slugified name of this file
|
||||||
@ -86,6 +107,8 @@ class Document(SpipWritable, SpipDocuments):
|
|||||||
|
|
||||||
# Write document to output destination
|
# Write document to output destination
|
||||||
def write(self, parent_dir: str) -> str:
|
def write(self, parent_dir: str) -> str:
|
||||||
|
# Apply needed conversions
|
||||||
|
super().convert_attrs()
|
||||||
# Define file source and destination
|
# Define file source and destination
|
||||||
src: str = CFG.data_dir + self.fichier
|
src: str = CFG.data_dir + self.fichier
|
||||||
dest: str = parent_dir + self.filename()
|
dest: str = parent_dir + self.filename()
|
||||||
@ -100,23 +123,25 @@ class SpipObject(SpipWritable):
|
|||||||
date: DateTimeField
|
date: DateTimeField
|
||||||
maj: str
|
maj: str
|
||||||
id_secteur: int
|
id_secteur: int
|
||||||
|
descriptif: str
|
||||||
|
extra: str
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
# Common fields that need conversions
|
# Common fields that need conversions
|
||||||
self.titre: str = convert(self.titre, True)
|
|
||||||
self.descriptif: str = convert(self.descriptif, True)
|
|
||||||
self.texte: str = convert(self.texte) # Convert SPIP to Markdown
|
|
||||||
self.statut: str = "false" if self.statut == "publie" else "true"
|
self.statut: str = "false" if self.statut == "publie" else "true"
|
||||||
self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
|
self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
|
||||||
self.extra: str = convert(self.extra) # Probably unused
|
|
||||||
# Define file prefix (needs to be redefined for sections)
|
# Define file prefix (needs to be redefined for sections)
|
||||||
self.prefix = "index"
|
self.prefix = "index"
|
||||||
|
|
||||||
# Convert SPIP style internal links for images & other files into Markdown style
|
# Convert SPIP style internal links for images & other files into Markdown style
|
||||||
def link_documents(self, documents: ModelSelect) -> None:
|
def link_documents(self, documents: ModelSelect) -> None:
|
||||||
for d in documents:
|
for d in documents:
|
||||||
self.texte = link_document(self.texte, d.id_document, d.titre, d.filename())
|
self.texte = sub(
|
||||||
|
DOCUMENT_LINK.format(d.id_document),
|
||||||
|
DOCUMENT_LINK_REPL.format(d.titre, d.filename()),
|
||||||
|
self.texte,
|
||||||
|
)
|
||||||
|
|
||||||
# Output related documents & link them in the text by the way
|
# Output related documents & link them in the text by the way
|
||||||
def documents(self, link_documents: bool = True) -> ModelSelect:
|
def documents(self, link_documents: bool = True) -> ModelSelect:
|
||||||
@ -198,8 +223,13 @@ class SpipObject(SpipWritable):
|
|||||||
body += "\n\n# EXTRA\n\n" + self.extra
|
body += "\n\n# EXTRA\n\n" + self.extra
|
||||||
return body
|
return body
|
||||||
|
|
||||||
|
def convert_attrs(self, *attrs: str) -> None:
|
||||||
|
return super().convert_attrs(*attrs, "descriptif", "extra")
|
||||||
|
|
||||||
# Write object to output destination
|
# Write object to output destination
|
||||||
def write(self, parent_dir: str) -> str:
|
def write(self, parent_dir: str) -> str:
|
||||||
|
# Apply needed conversions
|
||||||
|
super().convert_attrs()
|
||||||
# Define actual export directory
|
# Define actual export directory
|
||||||
directory: str = parent_dir + self.dir_slug()
|
directory: str = parent_dir + self.dir_slug()
|
||||||
# Make a directory for this object if there isn’t
|
# Make a directory for this object if there isn’t
|
||||||
@ -219,14 +249,15 @@ class Article(SpipObject, SpipArticles):
|
|||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
# More conversions needed for articles
|
# More conversions needed for articles
|
||||||
self.surtitre: str = convert(self.surtitre, True) # Probably unused
|
|
||||||
self.soustitre: str = convert(self.soustitre, True) # Probably unused
|
|
||||||
self.chapo: str = convert(self.chapo) # Probably unused
|
|
||||||
self.ps: str = convert(self.ps) # Probably unused
|
|
||||||
self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"
|
self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"
|
||||||
# ID
|
# ID
|
||||||
self.object_id = self.id_article
|
self.object_id = self.id_article
|
||||||
|
|
||||||
|
def convert_attrs(self, *attrs: str) -> None:
|
||||||
|
return super().convert_attrs(
|
||||||
|
*attrs, "surtitre", "soustitre", "chapo", "ps", "accepter_forum"
|
||||||
|
)
|
||||||
|
|
||||||
def frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
|
def frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
|
||||||
meta: dict[str, Any] = {
|
meta: dict[str, Any] = {
|
||||||
# Article specific
|
# Article specific
|
||||||
|
Loading…
Reference in New Issue
Block a user