separation between classes, functions & regex mappings

This commit is contained in:
Guilhem Fauré 2023-05-26 14:43:39 +02:00
parent 5e7740a414
commit 32738a9269
3 changed files with 91 additions and 99 deletions

View File

@ -9,11 +9,7 @@ from peewee import ModelSelect
from spip2md.config import CFG
from spip2md.database import DB
from spip2md.regexmap import unknown_chars, unknown_chars_context
from spip2md.spipobjects import (
Article,
Rubrique,
)
from spip2md.spipobjects import Rubrique
# Define styles
BOLD = 1 # Bold
@ -60,13 +56,7 @@ def root_sections(limit: int = 10**3) -> ModelSelect:
.limit(limit)
)
def has_unknown_chars(article: Article) -> bool:
    """Tell whether the text of ``article`` contains unknown characters.

    Args:
        article: Object whose ``texte`` attribute is scanned.

    Returns:
        True when ``unknown_chars_context`` reports at least one occurrence
        in ``article.texte``, False otherwise.
    """
    # An empty result is falsy, so its truthiness is directly the answer
    return bool(unknown_chars_context(article.texte))
r"""
# Print the detected unknown chars in article in their context but highlighted
def warn_unknown_chars(article: Article) -> None:
# Print the title of the article in which there is unknown characters
@ -85,6 +75,7 @@ def warn_unknown_chars(article: Article) -> None:
highlight(text, *unknown_chars(text))
style("\n")
print() # Break line
"""
# Print one root section list output correctly
@ -106,16 +97,16 @@ DB.connect()
def main(*argv):
if len(argv) == 0:
argv = sys.argv
# Define max nb of articles to export based on first CLI argument
# Define max nb of sections to export based on first CLI argument TODO
if len(argv) >= 2:
articles_export = int(argv[1])
else:
articles_export = CFG.max_articles_export
# Define max nb of sections to export based on second CLI argument
if len(argv) >= 3:
sections_export = int(argv[2])
sections_export = int(argv[1])
else:
sections_export = CFG.max_sections_export
# Define max nb of articles to export based on second CLI argument TODO
# if len(argv) >= 3:
# articles_export = int(argv[2])
# else:
# articles_export = CFG.max_articles_export
# Clear the output dir & create a new
if CFG.clear_output:

View File

@ -1,10 +1,9 @@
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
# pyright: strict
from re import I, S, compile, finditer, sub
from typing import Optional
from re import I, S, compile
# SPIP syntax to Markdown
SPIP_TO_MARKDOWN = (
# ((SPIP syntax, Replacement Markdown syntax), …)
SPIP_MARKDOWN = (
( # horizontal rule
compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I),
# r"---",
@ -43,15 +42,15 @@ SPIP_TO_MARKDOWN = (
),
( # images
compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I),
r"![](\1\2)",
r"![](\2)", # Needs to be further processed to replace ID with filename
),
( # documents & embeds
compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
r"[](\1\2)",
r"[](\2)", # Needs to be further processed to replace ID with filename
),
( # internal links
compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I),
r"[](\1\2)",
r"[](\2)", # Needs to be further processed to replace ID with filename
),
( # anchor
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
@ -106,20 +105,31 @@ SPIP_TO_MARKDOWN = (
),
"```\n\\1\n\n```",
),
( # WARNING remove every html tag
compile(r"<\/?.*?>\s*", S | I),
r"",
),
)
# Further cleaning for metadata texts such as titles or descriptions
SPIP_META_BLOAT = (
# Match against documents ID found in links, ID can be inserted with .format()
# Capture groups: (optional "!" prefix, existing link text, ID)
DOCUMENT_LINK = r"(!)?\[(.*?)\]\(({})\)"
# Replacement for DOCUMENT_LINK, name and path can be inserted with .format()
# Groups are referenced as \g<N>: a plain \2 directly followed by a name that
# starts with a digit would be parsed as another group number or an octal escape
DOCUMENT_LINK_REPL = r"\g<1>[\g<2>{}]({})"
# Multi language block: <multi> tags enclosing one or more "[lang] text" pairs
# NOTE the two capture groups (lang, text) sit inside a repeated group, so a
# match only exposes the LAST (lang, text) pair, not one pair per language
MULTILANG = compile(
r"<multi>(?:\s*\[(.{2,6})\]\s*(.*?)\s*)+<\/multi>",
S | I,
)
# WARNING probably useless text in metadata fields, to be removed
# No MULTILINE flag is set, so ^ only anchors at the very start of the text
BLOAT = (
compile(r"^>+ +", S | I), # Remove beginning with angle bracket(s)
compile(r"^\d+\. +", S | I), # Remove beginning with a number followed by a dot
)
# Broken ISO encoding to proper UTF-8
ISO_TO_UTF = (
# Matches against every HTML tag: anything from "<" (optionally "</") up to the
# next ">", together with the whitespace that directly follows it
HTMLTAG = compile(r"<\/?.*?>\s*", S | I)
# ((Broken ISO 8859-1 encoding, Proper UTF equivalent encoding), …)
ISO_UTF = (
( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1
"’",
r"",
@ -224,7 +234,7 @@ ISO_TO_UTF = (
"\u0081",
r"í",
),
# WARNING not sure
# WARNING not sure below
( # Fix UTF-8 é that was interpreted as ISO 8859-1
"",
r"é",
@ -239,62 +249,22 @@ ISO_TO_UTF = (
),
)
# WARNING unknown broken encoding
# WARNING broken ISO 8859-1 encoding whose UTF equivalent I don't know
UNKNOWN_ISO = (
r"
",
r"∆",
)
# Multi language block, capture the first
MULTILINGUAL = compile(
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
S | I,
"
",
"∆",
)
# Apply SPIP to Markdown & ISO to UTF conversions to a text, & eventually clean meta
def convert(text: Optional[str], clean_meta: bool = False) -> str:
    """Return ``text`` converted from SPIP syntax to Markdown.

    Args:
        text: Text to convert, ``None`` is accepted.
        clean_meta: When true, also remove every SPIP_META_BLOAT pattern.

    Returns:
        The converted text, or an empty string when ``text`` is ``None``.
    """
    if text is None:
        return ""
    converted: str = text
    # SPIP markup first
    for pattern, replacement in SPIP_TO_MARKDOWN:
        converted = pattern.sub(replacement, converted)
    # Then the optional metadata cleaning
    if clean_meta:
        for pattern in SPIP_META_BLOAT:
            converted = pattern.sub("", converted)
    # Finally the broken encoding sequences
    for broken, fixed in ISO_TO_UTF:
        converted = converted.replace(broken, fixed)
    return converted
# Replace images & files links in Markdown with real slugs of the actually linked files
def link_document(text: str, id: int, name: str, slug: str) -> str:
    """Point the Markdown links that target document ``id`` at its file.

    Args:
        text: Markdown text whose links still target a SPIP identifier such
            as ``img12`` or ``doc12``, optionally followed by ``|option`` parts.
        id: Numeric identifier of the document to look for.
        name: Title used as link text when the link has none.
        slug: File name or path the links must point to.

    Returns:
        The text with every matching link rewritten.
    """
    # Identifier, optional "|option" parts, then the closing parenthesis
    target = str(id) + r"(\|.*?)*\)"
    image = r"\((?:img|image)" + target
    document = r"\((?:doc|document|emb)" + target
    # Replacements are callables so that a backslash in name or slug is
    # inserted literally instead of being parsed as a template escape
    # Replace images that dont have a title written in text
    text = sub(r"!\[]" + image, lambda _: f"![{name}]({slug})", text)
    # Replace documents that dont have a title written in text
    text = sub(r"\[]" + document, lambda _: f"[{name}]({slug})", text)
    # Replace images that already had a title in Markdown style link
    text = sub(
        r"!\[(.+?)\]" + image,
        lambda match: f"![{match.group(1)}]({slug})",
        text,
    )
    # Replace documents that already had a title in Markdown style link
    text = sub(
        r"\[(.+?)\]" + document,
        lambda match: f"[{match.group(1)}]({slug})",
        text,
    )
    return text
# Special elements in terminal output to surround
# Each entry is (compiled pattern, replacement template); the {} placeholders
# are presumably filled through .format() with style codes — confirm in caller
SPECIAL_OUTPUT = (
(compile(r"^([0-9]+?\.)(?= )"), r"{}\1{}"), # Counter
(compile(r"(?<= )->(?= )"), r"{}->{}"), # Arrow
(compile(r"(?<=^Exporting )([0-9]+?)(?= )"), r"{}\1{}"), # Total
)
r"""
# Return a list of tuples giving the start and end of unknown substring in text
def unknown_chars(text: str) -> list[tuple[int, int]]:
positions: list[tuple[int, int]] = []
@ -303,7 +273,6 @@ def unknown_chars(text: str) -> list[tuple[int, int]]:
positions.append((match.start(), match.end()))
return positions
# Return strings with unknown chars found in text, surrounded by context_length chars
def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
errors: list[str] = []
@ -316,3 +285,4 @@ def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
for match in matches:
errors.append(match.group())
return errors
"""

View File

@ -1,7 +1,7 @@
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
from os import makedirs
from os.path import basename, splitext
from re import finditer
from re import finditer, sub
from shutil import copyfile
from typing import Any, Optional
@ -18,7 +18,16 @@ from spip2md.database import (
SpipDocumentsLiens,
SpipRubriques,
)
from spip2md.regexmap import convert, link_document, unknown_chars
from spip2md.regexmap import (
BLOAT,
DOCUMENT_LINK,
DOCUMENT_LINK_REPL,
HTMLTAG,
ISO_UTF,
MULTILANG,
SPIP_MARKDOWN,
UNKNOWN_ISO,
)
class SpipWritable:
@ -26,6 +35,7 @@ class SpipWritable:
texte: str
lang: str
titre: str
profondeur: int
def filename(self, date: bool = False) -> str:
raise NotImplementedError(
@ -49,6 +59,19 @@ class SpipWritable:
output[-1] += "MISSING NAME"
return output
# Apply different mappings to text fields, like SPIP to Markdown or encoding
def convert_attrs(self, *attrs: str) -> None:
    """Convert in place the text attributes named in ``attrs``.

    ``titre`` and ``descriptif`` are always converted in addition to the
    given names. For each attribute the SPIP_MARKDOWN substitutions, the
    BLOAT removals and the ISO_UTF replacements are applied in that order,
    each step working on the result of the previous one.

    Args:
        *attrs: Names of additional attributes to convert.
    """
    # dict.fromkeys drops duplicated names while keeping their order, so an
    # attribute requested twice is not converted twice
    for attr in dict.fromkeys((*attrs, "titre", "descriptif")):
        value = getattr(self, attr)
        if value is None:
            # A missing text is normalised to an empty string
            setattr(self, attr, "")
            continue
        if len(value) == 0:
            continue
        # Accumulate the conversions in value instead of restarting from the
        # initial text at every step
        for spip, markdown in SPIP_MARKDOWN:
            value = spip.sub(markdown, value)
        for bloat in BLOAT:
            value = bloat.sub("", value)
        for iso, utf in ISO_UTF:
            value = value.replace(iso, utf)
        setattr(self, attr, value)
# Write object to output destination
def write(self, parent_dir: str) -> str:
raise NotImplementedError(
@ -69,8 +92,6 @@ class Document(SpipWritable, SpipDocuments):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.titre: str = convert(self.titre, True)
self.descriptif: str = convert(self.descriptif, True)
self.statut: str = "false" if self.statut == "publie" else "true"
# Get slugified name of this file
@ -86,6 +107,8 @@ class Document(SpipWritable, SpipDocuments):
# Write document to output destination
def write(self, parent_dir: str) -> str:
# Apply needed conversions
super().convert_attrs()
# Define file source and destination
src: str = CFG.data_dir + self.fichier
dest: str = parent_dir + self.filename()
@ -100,23 +123,25 @@ class SpipObject(SpipWritable):
date: DateTimeField
maj: str
id_secteur: int
descriptif: str
extra: str
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Common fields that need conversions
self.titre: str = convert(self.titre, True)
self.descriptif: str = convert(self.descriptif, True)
self.texte: str = convert(self.texte) # Convert SPIP to Markdown
self.statut: str = "false" if self.statut == "publie" else "true"
self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
self.extra: str = convert(self.extra) # Probably unused
# Define file prefix (needs to be redefined for sections)
self.prefix = "index"
# Convert SPIP style internal links for images & other files into Markdown style
def link_documents(self, documents: ModelSelect) -> None:
for d in documents:
self.texte = link_document(self.texte, d.id_document, d.titre, d.filename())
self.texte = sub(
DOCUMENT_LINK.format(d.id_document),
DOCUMENT_LINK_REPL.format(d.titre, d.filename()),
self.texte,
)
# Output related documents & link them in the text by the way
def documents(self, link_documents: bool = True) -> ModelSelect:
@ -198,8 +223,13 @@ class SpipObject(SpipWritable):
body += "\n\n# EXTRA\n\n" + self.extra
return body
def convert_attrs(self, *attrs: str) -> None:
    """Convert the given attributes plus ``descriptif`` and ``extra``.

    The names are forwarded to the parent implementation, the two extra
    names being appended after the ones given by the caller.
    """
    object_attrs = ("descriptif", "extra")
    return super().convert_attrs(*attrs, *object_attrs)
# Write object to output destination
def write(self, parent_dir: str) -> str:
# Apply needed conversions
super().convert_attrs()
# Define actual export directory
directory: str = parent_dir + self.dir_slug()
# Make a directory for this object if there isnt
@ -219,14 +249,15 @@ class Article(SpipObject, SpipArticles):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# More conversions needed for articles
self.surtitre: str = convert(self.surtitre, True) # Probably unused
self.soustitre: str = convert(self.soustitre, True) # Probably unused
self.chapo: str = convert(self.chapo) # Probably unused
self.ps: str = convert(self.ps) # Probably unused
self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"
# ID
self.object_id = self.id_article
def convert_attrs(self, *attrs: str) -> None:
    """Convert the given attributes plus the article specific ones.

    The names are forwarded to the parent implementation, the article
    specific names being appended after the ones given by the caller.
    """
    article_attrs = ("surtitre", "soustitre", "chapo", "ps", "accepter_forum")
    return super().convert_attrs(*attrs, *article_attrs)
def frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
meta: dict[str, Any] = {
# Article specific