diff --git a/spip2md/__init__.py b/spip2md/__init__.py
index 3c50d5e..f67b322 100644
--- a/spip2md/__init__.py
+++ b/spip2md/__init__.py
@@ -9,11 +9,7 @@ from peewee import ModelSelect
from spip2md.config import CFG
from spip2md.database import DB
-from spip2md.regexmap import unknown_chars, unknown_chars_context
-from spip2md.spipobjects import (
- Article,
- Rubrique,
-)
+from spip2md.spipobjects import Rubrique
# Define styles
BOLD = 1 # Bold
@@ -60,13 +56,7 @@ def root_sections(limit: int = 10**3) -> ModelSelect:
.limit(limit)
)
-
-def has_unknown_chars(article: Article) -> bool:
- if len(unknown_chars_context(article.texte)) > 0:
- return True
- return False
-
-
+r"""
# Print the detected unknown chars in article in their context but highlighted
def warn_unknown_chars(article: Article) -> None:
# Print the title of the article in which there is unknown characters
@@ -85,6 +75,7 @@ def warn_unknown_chars(article: Article) -> None:
highlight(text, *unknown_chars(text))
style(" … \n")
print() # Break line
+"""
# Print one root section list output correctly
@@ -106,16 +97,16 @@ DB.connect()
def main(*argv):
if len(argv) == 0:
argv = sys.argv
- # Define max nb of articles to export based on first CLI argument
+ # Define max nb of sections to export based on first CLI argument TODO
if len(argv) >= 2:
- articles_export = int(argv[1])
- else:
- articles_export = CFG.max_articles_export
- # Define max nb of sections to export based on second CLI argument
- if len(argv) >= 3:
- sections_export = int(argv[2])
+ sections_export = int(argv[1])
else:
sections_export = CFG.max_sections_export
+ # Define max nb of articles to export based on second CLI argument TODO
+ # if len(argv) >= 3:
+ # articles_export = int(argv[2])
+ # else:
+ # articles_export = CFG.max_articles_export
# Clear the output dir & create a new
if CFG.clear_output:
diff --git a/spip2md/regexmap.py b/spip2md/regexmap.py
index 5222286..314f0e1 100644
--- a/spip2md/regexmap.py
+++ b/spip2md/regexmap.py
@@ -1,10 +1,9 @@
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
# pyright: strict
-from re import I, S, compile, finditer, sub
-from typing import Optional
+from re import I, S, compile
-# SPIP syntax to Markdown
-SPIP_TO_MARKDOWN = (
+# ((SPIP syntax, Replacement Markdown syntax), …)
+SPIP_MARKDOWN = (
( # horizontal rule
compile(r"- ?- ?- ?- ?[\- ]*|
", S | I),
# r"---",
@@ -43,15 +42,15 @@ SPIP_TO_MARKDOWN = (
),
( # images
compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I),
- r"![](\1\2)",
+ r"![](\2)", # Needs to be further processed to replace ID with filename
),
( # documents & embeds
compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
- r"[](\1\2)",
+ r"[](\2)", # Needs to be further processed to replace ID with filename
),
( # internal links
compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I),
- r"[](\1\2)",
+ r"[](\2)", # Needs to be further processed to replace ID with filename
),
( # anchor
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
@@ -106,20 +105,31 @@ SPIP_TO_MARKDOWN = (
),
"```\n\\1\n\n```",
),
- ( # WARNING remove every html tag
- compile(r"<\/?.*?>\s*", S | I),
- r"",
- ),
)
-# Further cleaning for metadata texts such as titles or descriptions
-SPIP_META_BLOAT = (
+# Match against documents ID found in links, ID can be inserted with .format()
+# Name & path are inserted with .format(); \g<N> keeps group refs apart from them
+DOCUMENT_LINK = r"(!)?\[(.*?)\]\(({})\)"
+DOCUMENT_LINK_REPL = r"\g<1>[\g<2>{}]({})"
+
+# Multi language block; NOTE a repeated group only keeps its last (lang, text)
+MULTILANG = compile(
+    r"(?:\s*\[(.{2,6})\]\s*(.*?)\s*)+<\/multi>",
+    S | I,
+)
+
+# WARNING probably useless text in metadata fields, to be removed
+BLOAT = (
compile(r"^>+ +", S | I), # Remove beginning with angle bracket(s)
compile(r"^\d+\. +", S | I), # Remove beginning with a number followed by a dot
)
-# Broken ISO encoding to proper UTF-8
-ISO_TO_UTF = (
+# Matches against every HTML tag
+HTMLTAG = compile(r"<\/?.*?>\s*", S | I)
+
+
+# ((Broken ISO 8859-1 encoding, Proper UTF equivalent encoding), …)
+ISO_UTF = (
( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1
"’",
r"’",
@@ -224,7 +234,7 @@ ISO_TO_UTF = (
"iÌ\u0081",
r"í",
),
- # WARNING not sure
+ # WARNING not sure below
( # Fix UTF-8 é that was interpreted as ISO 8859-1
"eÌ ",
r"é",
@@ -239,62 +249,22 @@ ISO_TO_UTF = (
),
)
-# WARNING unknown broken encoding
+# WARNING broken ISO 8859-1 encoding whose UTF equivalent I don’t know
UNKNOWN_ISO = (
- r"
",
- r"∆",
-)
-
-# Multi language block, capture the first
-MULTILINGUAL = compile(
- r"\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
- S | I,
+ "
",
+ "∆",
)
-# Apply SPIP to Markdown & ISO to UTF conversions to a text, & eventually clean meta
-def convert(text: Optional[str], clean_meta: bool = False) -> str:
- if text is None:
- return ""
- for spip, markdown in SPIP_TO_MARKDOWN:
- text = spip.sub(markdown, text)
- if clean_meta:
- for bloat in SPIP_META_BLOAT:
- text = bloat.sub("", text)
- for iso, utf in ISO_TO_UTF:
- text = text.replace(iso, utf)
- return text
-
-
-# Replace images & files links in Markdown with real slugs of the actually linked files
-def link_document(text: str, id: int, name: str, slug: str) -> str:
- # Replace images that dont have a title written in text
- text = sub(
- r"!\[]\((?:img|image)" + str(id) + r"(\|.*?)*\)",
- f"![{name}]({slug})",
- text,
- )
- # Replace images that dont have a title written in text
- text = sub(
- r"\[]\((?:doc|document|emb)" + str(id) + r"(\|.*?)*\)",
- f"[{name}]({slug})",
- text,
- )
- # Replace images that already had a title in Markdown style link
- text = sub(
- r"!\[(.+?)\]\((?:img|image)" + str(id) + r"(\|.*?)*\)",
- f"![\\1]({slug})",
- text,
- )
- # Replace documents that already had a title in Markdown style link
- text = sub(
- r"\[(.+?)\]\((?:doc|document|emb)" + str(id) + r"(\|.*?)*\)",
- f"[\\1]({slug})",
- text,
- )
- return text
+# ((Special element of terminal output, Template surrounding it with 2 {}), …)
+SPECIAL_OUTPUT = (
+    (compile(r"^([0-9]+?\.)(?= )"), r"{}\1{}"),  # Counter
+    (compile(r"(?<= )->(?= )"), r"{}->{}"),  # Arrow
+    (compile(r"(?<=^Exporting )([0-9]+?)(?= )"), r"{}\1{}"),  # Total
+)
+r"""
# Return a list of tuples giving the start and end of unknown substring in text
def unknown_chars(text: str) -> list[tuple[int, int]]:
positions: list[tuple[int, int]] = []
@@ -303,7 +273,6 @@ def unknown_chars(text: str) -> list[tuple[int, int]]:
positions.append((match.start(), match.end()))
return positions
-
# Return strings with unknown chards found in text, surrounded by context_length chars
def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
errors: list[str] = []
@@ -316,3 +285,4 @@ def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
for match in matches:
errors.append(match.group())
return errors
+"""
diff --git a/spip2md/spipobjects.py b/spip2md/spipobjects.py
index a2abf14..ee4ffe5 100644
--- a/spip2md/spipobjects.py
+++ b/spip2md/spipobjects.py
@@ -1,7 +1,7 @@
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
from os import makedirs
from os.path import basename, splitext
-from re import finditer
+from re import finditer, sub
from shutil import copyfile
from typing import Any, Optional
@@ -18,7 +18,16 @@ from spip2md.database import (
SpipDocumentsLiens,
SpipRubriques,
)
-from spip2md.regexmap import convert, link_document, unknown_chars
+from spip2md.regexmap import (
+ BLOAT,
+ DOCUMENT_LINK,
+ DOCUMENT_LINK_REPL,
+ HTMLTAG,
+ ISO_UTF,
+ MULTILANG,
+ SPIP_MARKDOWN,
+ UNKNOWN_ISO,
+)
class SpipWritable:
@@ -26,6 +35,7 @@ class SpipWritable:
texte: str
lang: str
titre: str
+ profondeur: int
def filename(self, date: bool = False) -> str:
raise NotImplementedError(
@@ -49,6 +59,19 @@ class SpipWritable:
output[-1] += "MISSING NAME"
return output
+    # Apply different mappings to text fields, like SPIP to Markdown or encoding
+    def convert_attrs(self, *attrs: str) -> None:
+        attrs += "titre", "descriptif"  # Always processed, after the given attrs
+        for attr in attrs:
+            if a := getattr(self, attr):  # Skip None & empty fields
+                for spip, markdown in SPIP_MARKDOWN:
+                    a = spip.sub(markdown, a)  # Chain on the running value
+                for bloat in BLOAT:
+                    a = bloat.sub("", a)
+                for iso, utf in ISO_UTF:
+                    a = a.replace(iso, utf)
+                setattr(self, attr, a)  # Store once every mapping is applied
+
# Write object to output destination
def write(self, parent_dir: str) -> str:
raise NotImplementedError(
@@ -69,8 +92,6 @@ class Document(SpipWritable, SpipDocuments):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
- self.titre: str = convert(self.titre, True)
- self.descriptif: str = convert(self.descriptif, True)
self.statut: str = "false" if self.statut == "publie" else "true"
# Get slugified name of this file
@@ -86,6 +107,8 @@ class Document(SpipWritable, SpipDocuments):
# Write document to output destination
def write(self, parent_dir: str) -> str:
+ # Apply needed conversions
+ super().convert_attrs()
# Define file source and destination
src: str = CFG.data_dir + self.fichier
dest: str = parent_dir + self.filename()
@@ -100,23 +123,25 @@ class SpipObject(SpipWritable):
date: DateTimeField
maj: str
id_secteur: int
+ descriptif: str
+ extra: str
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Common fields that need conversions
- self.titre: str = convert(self.titre, True)
- self.descriptif: str = convert(self.descriptif, True)
- self.texte: str = convert(self.texte) # Convert SPIP to Markdown
self.statut: str = "false" if self.statut == "publie" else "true"
self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
- self.extra: str = convert(self.extra) # Probably unused
# Define file prefix (needs to be redefined for sections)
self.prefix = "index"
# Convert SPIP style internal links for images & other files into Markdown style
def link_documents(self, documents: ModelSelect) -> None:
for d in documents:
- self.texte = link_document(self.texte, d.id_document, d.titre, d.filename())
+ self.texte = sub(
+ DOCUMENT_LINK.format(d.id_document),
+ DOCUMENT_LINK_REPL.format(d.titre, d.filename()),
+ self.texte,
+ )
# Output related documents & link them in the text by the way
def documents(self, link_documents: bool = True) -> ModelSelect:
@@ -198,8 +223,13 @@ class SpipObject(SpipWritable):
body += "\n\n# EXTRA\n\n" + self.extra
return body
+    def convert_attrs(self, *attrs: str) -> None:  # Base already adds "descriptif"
+        return super().convert_attrs(*attrs, "extra")
+
# Write object to output destination
def write(self, parent_dir: str) -> str:
+        # Apply needed conversions, via self so that subclass overrides are used
+        self.convert_attrs()
# Define actual export directory
directory: str = parent_dir + self.dir_slug()
# Make a directory for this object if there isn’t
@@ -219,14 +249,15 @@ class Article(SpipObject, SpipArticles):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# More conversions needed for articles
- self.surtitre: str = convert(self.surtitre, True) # Probably unused
- self.soustitre: str = convert(self.soustitre, True) # Probably unused
- self.chapo: str = convert(self.chapo) # Probably unused
- self.ps: str = convert(self.ps) # Probably unused
self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"
# ID
self.object_id = self.id_article
+ def convert_attrs(self, *attrs: str) -> None:
+ return super().convert_attrs(
+ *attrs, "surtitre", "soustitre", "chapo", "ps", "accepter_forum"
+ )
+
def frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
meta: dict[str, Any] = {
# Article specific