separation between classes, functions & regex mappings

This commit is contained in:
Guilhem Fauré 2023-05-26 14:43:39 +02:00
parent 5e7740a414
commit 32738a9269
3 changed files with 91 additions and 99 deletions

View File

@ -9,11 +9,7 @@ from peewee import ModelSelect
from spip2md.config import CFG from spip2md.config import CFG
from spip2md.database import DB from spip2md.database import DB
from spip2md.regexmap import unknown_chars, unknown_chars_context from spip2md.spipobjects import Rubrique
from spip2md.spipobjects import (
Article,
Rubrique,
)
# Define styles # Define styles
BOLD = 1 # Bold BOLD = 1 # Bold
@ -60,13 +56,7 @@ def root_sections(limit: int = 10**3) -> ModelSelect:
.limit(limit) .limit(limit)
) )
r"""
def has_unknown_chars(article: Article) -> bool:
if len(unknown_chars_context(article.texte)) > 0:
return True
return False
# Print the detected unknown chars in article in their context but highlighted # Print the detected unknown chars in article in their context but highlighted
def warn_unknown_chars(article: Article) -> None: def warn_unknown_chars(article: Article) -> None:
# Print the title of the article in which there is unknown characters # Print the title of the article in which there is unknown characters
@ -85,6 +75,7 @@ def warn_unknown_chars(article: Article) -> None:
highlight(text, *unknown_chars(text)) highlight(text, *unknown_chars(text))
style("\n") style("\n")
print() # Break line print() # Break line
"""
# Print one root section list output correctly # Print one root section list output correctly
@ -106,16 +97,16 @@ DB.connect()
def main(*argv): def main(*argv):
if len(argv) == 0: if len(argv) == 0:
argv = sys.argv argv = sys.argv
# Define max nb of articles to export based on first CLI argument # Define max nb of sections to export based on first CLI argument TODO
if len(argv) >= 2: if len(argv) >= 2:
articles_export = int(argv[1]) sections_export = int(argv[1])
else:
articles_export = CFG.max_articles_export
# Define max nb of sections to export based on second CLI argument
if len(argv) >= 3:
sections_export = int(argv[2])
else: else:
sections_export = CFG.max_sections_export sections_export = CFG.max_sections_export
# Define max nb of articles to export based on second CLI argument TODO
# if len(argv) >= 3:
# articles_export = int(argv[2])
# else:
# articles_export = CFG.max_articles_export
# Clear the output dir & create a new # Clear the output dir & create a new
if CFG.clear_output: if CFG.clear_output:

View File

@ -1,10 +1,9 @@
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
# pyright: strict # pyright: strict
from re import I, S, compile, finditer, sub from re import I, S, compile
from typing import Optional
# SPIP syntax to Markdown # ((SPIP syntax, Replacement Markdown syntax), …)
SPIP_TO_MARKDOWN = ( SPIP_MARKDOWN = (
( # horizontal rule ( # horizontal rule
compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I), compile(r"- ?- ?- ?- ?[\- ]*|<hr ?.*?>", S | I),
# r"---", # r"---",
@ -43,15 +42,15 @@ SPIP_TO_MARKDOWN = (
), ),
( # images ( # images
compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I), compile(r"<(img|image)([0-9]+)(\|.*?)*>", S | I),
r"![](\1\2)", r"![](\2)", # Needs to be further processed to replace ID with filename
), ),
( # documents & embeds ( # documents & embeds
compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I), compile(r"<(doc|document|emb)([0-9]+)(\|.*?)*>", S | I),
r"[](\1\2)", r"[](\2)", # Needs to be further processed to replace ID with filename
), ),
( # internal links ( # internal links
compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I), compile(r"<(art|article)([0-9]+)(\|.*?)*>", S | I),
r"[](\1\2)", r"[](\2)", # Needs to be further processed to replace ID with filename
), ),
( # anchor ( # anchor
compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I), compile(r"\[ *(.*?) *-> *(.*?) *\]", S | I),
@ -106,20 +105,31 @@ SPIP_TO_MARKDOWN = (
), ),
"```\n\\1\n\n```", "```\n\\1\n\n```",
), ),
( # WARNING remove every html tag
compile(r"<\/?.*?>\s*", S | I),
r"",
),
) )
# Further cleaning for metadata texts such as titles or descriptions # Match against documents ID found in links, ID can be inserted with .format()
SPIP_META_BLOAT = ( # Name and path can be further replaced with .format()
DOCUMENT_LINK = r"(!)?\[(.*?)\]\(({})\)"
DOCUMENT_LINK_REPL = r"\1[\2{}]({})"
# Multi language block, capture groups: (lang, text, lang, text, …)
MULTILANG = compile(
r"<multi>(?:\s*\[(.{2,6})\]\s*(.*?)\s*)+<\/multi>",
S | I,
)
# WARNING probably useless text in metadata fields, to be removed
BLOAT = (
compile(r"^>+ +", S | I), # Remove beginning with angle bracket(s) compile(r"^>+ +", S | I), # Remove beginning with angle bracket(s)
compile(r"^\d+\. +", S | I), # Remove beginning with a number followed by a dot compile(r"^\d+\. +", S | I), # Remove beginning with a number followed by a dot
) )
# Broken ISO encoding to proper UTF-8 # Matches against every HTML tag
ISO_TO_UTF = ( HTMLTAG = compile(r"<\/?.*?>\s*", S | I)
# ((Broken ISO 8859-1 encoding, Proper UTF equivalent encoding), …)
ISO_UTF = (
( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1 ( # Fix UTF-8 appostrophe that was interpreted as ISO 8859-1
"’", "’",
r"", r"",
@ -224,7 +234,7 @@ ISO_TO_UTF = (
"\u0081", "\u0081",
r"í", r"í",
), ),
# WARNING not sure # WARNING not sure below
( # Fix UTF-8 é that was interpreted as ISO 8859-1 ( # Fix UTF-8 é that was interpreted as ISO 8859-1
"", "",
r"é", r"é",
@ -239,62 +249,22 @@ ISO_TO_UTF = (
), ),
) )
# WARNING unknown broken encoding # WARNING broken ISO 8859-1 encoding for which I don't know the UTF equivalent
UNKNOWN_ISO = ( UNKNOWN_ISO = (
r"
", "
",
r"∆", "∆",
)
# Multi language block, capture the first
MULTILINGUAL = compile(
r"<multi>\s*(?:\[.{2,4}\])?\s*(.*?)\s*(?:\s*\[.{2,4}\].*)*<\/multi>",
S | I,
) )
# Apply SPIP to Markdown & ISO to UTF conversions to a text, & eventually clean meta # Special elements in terminal output to surround
def convert(text: Optional[str], clean_meta: bool = False) -> str: SPECIAL_OUTPUT = (
if text is None: (compile(r"^([0-9]+?\.)(?= )"), r"{}\1{}"), # Counter
return "" (compile(r"(?<= )->(?= )"), r"{}->{}"), # Arrow
for spip, markdown in SPIP_TO_MARKDOWN: (compile(r"(?<=^Exporting )([0-9]+?)(?= )"), r"{}\1{}"), # Total
text = spip.sub(markdown, text) )
if clean_meta:
for bloat in SPIP_META_BLOAT:
text = bloat.sub("", text)
for iso, utf in ISO_TO_UTF:
text = text.replace(iso, utf)
return text
# Replace images & files links in Markdown with real slugs of the actually linked files
def link_document(text: str, id: int, name: str, slug: str) -> str:
# Replace images that dont have a title written in text
text = sub(
r"!\[]\((?:img|image)" + str(id) + r"(\|.*?)*\)",
f"![{name}]({slug})",
text,
)
# Replace images that dont have a title written in text
text = sub(
r"\[]\((?:doc|document|emb)" + str(id) + r"(\|.*?)*\)",
f"[{name}]({slug})",
text,
)
# Replace images that already had a title in Markdown style link
text = sub(
r"!\[(.+?)\]\((?:img|image)" + str(id) + r"(\|.*?)*\)",
f"![\\1]({slug})",
text,
)
# Replace documents that already had a title in Markdown style link
text = sub(
r"\[(.+?)\]\((?:doc|document|emb)" + str(id) + r"(\|.*?)*\)",
f"[\\1]({slug})",
text,
)
return text
r"""
# Return a list of tuples giving the start and end of unknown substring in text # Return a list of tuples giving the start and end of unknown substring in text
def unknown_chars(text: str) -> list[tuple[int, int]]: def unknown_chars(text: str) -> list[tuple[int, int]]:
positions: list[tuple[int, int]] = [] positions: list[tuple[int, int]] = []
@ -303,7 +273,6 @@ def unknown_chars(text: str) -> list[tuple[int, int]]:
positions.append((match.start(), match.end())) positions.append((match.start(), match.end()))
return positions return positions
# Return strings with unknown chars found in text, surrounded by context_length chars # Return strings with unknown chars found in text, surrounded by context_length chars
def unknown_chars_context(text: str, context_length: int = 24) -> list[str]: def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
errors: list[str] = [] errors: list[str] = []
@ -316,3 +285,4 @@ def unknown_chars_context(text: str, context_length: int = 24) -> list[str]:
for match in matches: for match in matches:
errors.append(match.group()) errors.append(match.group())
return errors return errors
"""

View File

@ -1,7 +1,7 @@
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
from os import makedirs from os import makedirs
from os.path import basename, splitext from os.path import basename, splitext
from re import finditer from re import finditer, sub
from shutil import copyfile from shutil import copyfile
from typing import Any, Optional from typing import Any, Optional
@ -18,7 +18,16 @@ from spip2md.database import (
SpipDocumentsLiens, SpipDocumentsLiens,
SpipRubriques, SpipRubriques,
) )
from spip2md.regexmap import convert, link_document, unknown_chars from spip2md.regexmap import (
BLOAT,
DOCUMENT_LINK,
DOCUMENT_LINK_REPL,
HTMLTAG,
ISO_UTF,
MULTILANG,
SPIP_MARKDOWN,
UNKNOWN_ISO,
)
class SpipWritable: class SpipWritable:
@ -26,6 +35,7 @@ class SpipWritable:
texte: str texte: str
lang: str lang: str
titre: str titre: str
profondeur: int
def filename(self, date: bool = False) -> str: def filename(self, date: bool = False) -> str:
raise NotImplementedError( raise NotImplementedError(
@ -49,6 +59,19 @@ class SpipWritable:
output[-1] += "MISSING NAME" output[-1] += "MISSING NAME"
return output return output
# Apply different mappings to text fields, like SPIP to Markdown or encoding.
# "titre" and "descriptif" are always converted, in addition to the attribute
# names given in attrs. Each converted field is written back on the object.
def convert_attrs(self, *attrs: str) -> None:
    # dict.fromkeys drops names given twice (an override may pass "descriptif"
    # again) while keeping their order, so no field is converted twice
    for attr in dict.fromkeys((*attrs, "titre", "descriptif")):
        text = getattr(self, attr)
        if text is None:
            # Normalize missing values to an empty string instead of crashing
            setattr(self, attr, "")
            continue
        if len(text) == 0:
            continue
        # Chain every mapping on the same working copy: computing each result
        # from the untouched original value kept only the very last replacement
        for spip, markdown in SPIP_MARKDOWN:
            text = spip.sub(markdown, text)
        for bloat in BLOAT:
            text = bloat.sub("", text)
        for iso, utf in ISO_UTF:
            text = text.replace(iso, utf)
        setattr(self, attr, text)
# Write object to output destination # Write object to output destination
def write(self, parent_dir: str) -> str: def write(self, parent_dir: str) -> str:
raise NotImplementedError( raise NotImplementedError(
@ -69,8 +92,6 @@ class Document(SpipWritable, SpipDocuments):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.titre: str = convert(self.titre, True)
self.descriptif: str = convert(self.descriptif, True)
self.statut: str = "false" if self.statut == "publie" else "true" self.statut: str = "false" if self.statut == "publie" else "true"
# Get slugified name of this file # Get slugified name of this file
@ -86,6 +107,8 @@ class Document(SpipWritable, SpipDocuments):
# Write document to output destination # Write document to output destination
def write(self, parent_dir: str) -> str: def write(self, parent_dir: str) -> str:
# Apply needed conversions
super().convert_attrs()
# Define file source and destination # Define file source and destination
src: str = CFG.data_dir + self.fichier src: str = CFG.data_dir + self.fichier
dest: str = parent_dir + self.filename() dest: str = parent_dir + self.filename()
@ -100,23 +123,25 @@ class SpipObject(SpipWritable):
date: DateTimeField date: DateTimeField
maj: str maj: str
id_secteur: int id_secteur: int
descriptif: str
extra: str
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
# Common fields that need conversions # Common fields that need conversions
self.titre: str = convert(self.titre, True)
self.descriptif: str = convert(self.descriptif, True)
self.texte: str = convert(self.texte) # Convert SPIP to Markdown
self.statut: str = "false" if self.statut == "publie" else "true" self.statut: str = "false" if self.statut == "publie" else "true"
self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true" self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
self.extra: str = convert(self.extra) # Probably unused
# Define file prefix (needs to be redefined for sections) # Define file prefix (needs to be redefined for sections)
self.prefix = "index" self.prefix = "index"
# Convert SPIP style internal links for images & other files into Markdown style # Convert SPIP style internal links for images & other files into Markdown style
def link_documents(self, documents: ModelSelect) -> None: def link_documents(self, documents: ModelSelect) -> None:
for d in documents: for d in documents:
self.texte = link_document(self.texte, d.id_document, d.titre, d.filename()) self.texte = sub(
DOCUMENT_LINK.format(d.id_document),
DOCUMENT_LINK_REPL.format(d.titre, d.filename()),
self.texte,
)
# Output related documents & link them in the text by the way # Output related documents & link them in the text by the way
def documents(self, link_documents: bool = True) -> ModelSelect: def documents(self, link_documents: bool = True) -> ModelSelect:
@ -198,8 +223,13 @@ class SpipObject(SpipWritable):
body += "\n\n# EXTRA\n\n" + self.extra body += "\n\n# EXTRA\n\n" + self.extra
return body return body
# Forward the requested attribute names to the parent implementation,
# followed by "descriptif" and "extra"
def convert_attrs(self, *attrs: str) -> None:
    extended = (*attrs, "descriptif", "extra")
    return super().convert_attrs(*extended)
# Write object to output destination # Write object to output destination
def write(self, parent_dir: str) -> str: def write(self, parent_dir: str) -> str:
# Apply needed conversions
super().convert_attrs()
# Define actual export directory # Define actual export directory
directory: str = parent_dir + self.dir_slug() directory: str = parent_dir + self.dir_slug()
# Make a directory for this object if there isnt # Make a directory for this object if there isnt
@ -219,14 +249,15 @@ class Article(SpipObject, SpipArticles):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
# More conversions needed for articles # More conversions needed for articles
self.surtitre: str = convert(self.surtitre, True) # Probably unused
self.soustitre: str = convert(self.soustitre, True) # Probably unused
self.chapo: str = convert(self.chapo) # Probably unused
self.ps: str = convert(self.ps) # Probably unused
self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false" self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"
# ID # ID
self.object_id = self.id_article self.object_id = self.id_article
# Forward the requested attribute names to the parent implementation,
# followed by the article specific fields
def convert_attrs(self, *attrs: str) -> None:
    article_fields = ("surtitre", "soustitre", "chapo", "ps", "accepter_forum")
    return super().convert_attrs(*attrs, *article_fields)
def frontmatter(self, append: Optional[dict[str, Any]] = None) -> str: def frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
meta: dict[str, Any] = { meta: dict[str, Any] = {
# Article specific # Article specific