modified architecture to translate based on root tree lang

This commit is contained in:
Guilhem Fauré 2023-06-02 17:51:42 +02:00
parent c10c6d81fe
commit cea92e59cb
3 changed files with 174 additions and 197 deletions

View File

@ -2,7 +2,7 @@
import logging
from os import makedirs
from os.path import basename, splitext
from re import Pattern, finditer, search
from re import Match, Pattern, finditer, search
from shutil import copyfile
from typing import Any, Optional
@ -20,11 +20,11 @@ from spip2md.config import CFG
from spip2md.regexmaps import (
ARTICLE_LINK,
BLOAT,
CONFIGLANGS,
DOCUMENT_LINK,
HTMLTAGS,
ISO_UTF,
MULTILANG_BLOCK,
MULTILANGS,
SECTION_LINK,
SPECIAL_OUTPUT,
SPIP_MARKDOWN,
@ -149,7 +149,7 @@ class WritableObject(SpipInterface):
for match in finditer("(" + char + ")+", text):
context: str = unknown_chars_context(text[lastend:], char)
LOG.warn(
f"Unknown char {char} found in {self._title[:40]} at: {context}"
f"Unknown char {char} found in {self.titre[:40]} at: {context}"
)
if CFG.unknown_char_replacement is not None:
LOG.warn(
@ -178,20 +178,16 @@ class WritableObject(SpipInterface):
field = self.warn_unknown(field, UNKNOWN_ISO)
return field.strip() # Strip whitespaces around text
def convert_title(self) -> str:
if self.titre is None:
self._title = "" # Define temporary title to use in functions
else:
self._title = self.titre.strip() # Define temporary title
return self.convert_field(self.titre)
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Initialize converted fields beginning with underscore
self._title: str = self.convert_title()
self._description: str = self.convert_field(self.descriptif)
self._status = self.statut == "publie"
# Apply post-init conversions and cancel the export if self not of the right lang
def convert(self) -> None:
self._title = self.convert_field(self.titre)
# Print one or more line(s) in which special elements are stylized
def style_print(
self, string: str, indent: Optional[str] = " ", end: str = "\n"
@ -208,21 +204,20 @@ class WritableObject(SpipInterface):
return stylized
# Print the message telling what is going to be done
def begin_message(
self, index: int, limit: int, prepend: str = "", step: int = 100
) -> list[str]:
def begin_message(self, index: int, limit: int, step: int = 100) -> list[str]:
output: list[str] = []
# Output the remaining number of objects to export every step object
if index % step == 0 and limit > 0:
output.append(f"Exporting {limit-index}")
output[-1] += f" level {self._depth}"
s: str = "s" if limit - index > 1 else ""
if hasattr(self, "lang"):
output[-1] += f" {self.lang}"
output[-1] += f" {type(self).__name__}{s}"
# Print the output as the program goes
self.style_print(output[-1])
# Output the counter & title of the object being exported
output.append(f"{index + 1}. ")
output[-1] += prepend
if len(self._title) == 0:
output[-1] += "EMPTY NAME"
else:
@ -248,18 +243,13 @@ class WritableObject(SpipInterface):
# Perform all the write steps of this object
def write_all(
self,
parentdepth: int,
parentdir: str,
index: int,
total: int,
prepend: str = "",
self, parentdepth: int, parentdir: str, index: int, total: int
) -> RecursiveList:
LOG.debug(f"Writing {type(self).__name__} `{self._title}`")
output: RecursiveList = []
self._depth = parentdepth + 1
self._parentdir = parentdir
for m in self.begin_message(index, total, prepend):
for m in self.begin_message(index, total):
output.append(m)
try:
output[-1] += self.end_message(self.write())
@ -279,26 +269,38 @@ class Document(WritableObject, NormalizedDocument):
return data_dir + self.fichier
# Get directory of this object
def dest_directory(self, prepend: str = "", append: str = "") -> str:
return self._parentdir + prepend + slugify(self._title, max_length=100) + append
def dest_directory(
self, prepend_id: bool = True, prepend: str = "", append: str = ""
) -> str:
_id: str = str(self._id) + "-" if prepend_id else ""
return (
self._parentdir
+ prepend
+ slugify(_id + self._title, max_length=100)
+ append
)
# Get destination slugified name of this file
def dest_filename(self, prepend: str = "", append: str = "") -> str:
name, filetype = splitext(basename(str(self.fichier)))
return slugify(prepend + name, max_length=100) + append + filetype
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Initialize converted fields beginning with underscore
# self._src_path = self.src_path()
# self._dest_directory = self.dest_directory()
# self._dest_filename = self.dest_filename()
# Write document to output destination
def write(self) -> str:
# Copy the document from its SPIP location to the new location
return copyfile(self.src_path(), self.dest_path())
# Perform all the write steps of this object
def write_all(
self, parentdepth: int, parentdir: str, index: int, total: int
) -> RecursiveList:
self.convert() # Apply post-init conversions
return super().write_all(parentdepth, parentdir, index, total)
class LangNotFoundError(Exception):
pass
class RedactionalObject(WritableObject):
id_trad: BigIntegerField | int
@ -312,53 +314,25 @@ class RedactionalObject(WritableObject):
# Converted
_text: str
# Detect every language present in <multi> blocks of text
# For each language in <multi> block in which we want to translate, create
# a new self-similar object in self.translations dict
def translate_multi(self, spipattr: str) -> str:
# Function specific logger
log = logging.getLogger(CFG.logname + ".models.translate_multi")
text: str = getattr(self, spipattr) # Get text of attribute
log.debug(f"Translating <multi> blocks of `{self._title}` `{spipattr}`")
# Handle <multi> multi language blocks
translations: dict[str, str] = {} # Dict such as lang: text
original_translation: str = text
# for each langs of <multi> blocks, add its text to the corresponding dict key
# Get rid of other lang than forced in text and modify lang to forced if found
def translate_field(self, forced_lang: str, text: str) -> str:
LOG.debug(f"Translating <multi> blocks of `{self._title}`")
# for each <multi> blocks, keep only forced lang
lang: Optional[Match[str]] = None
for block in MULTILANG_BLOCK.finditer(text):
for lang in MULTILANGS.finditer(block.group(1)):
# To log the translation
trans: str = lang.group(2)[:50].strip()
if lang.group(1) == self.lang:
log.debug(
f"Discovered {lang.group(1)} translation of `{self._title}`:"
+ f" `{trans}`, keeping it as the original lang"
)
original_translation = original_translation.replace(
block.group(), lang.group(2)
)
elif lang.group(1) in translations:
log.debug(
f"Discovered more {lang.group(1)} translation of"
+ f" `{self._title}`: `{trans}`"
)
translations[lang.group(1)] += lang.group(2)
else:
log.debug(
f"Discovered {lang.group(1)} translation of `{self._title}`: "
+ f" `{trans}`"
)
translations[lang.group(1)] = lang.group(2)
# Iterate over translations, adding translated attributes to translations dict
for lang, translation in translations.items():
if lang in CFG.export_languages:
if lang not in self._translations:
self._translations[lang] = {}
self._translations[lang][spipattr] = translation
log.debug(
f"{lang} `{self._title}` `{spipattr}`"
+ f" set to `{self._translations[lang][spipattr]}`"
lang = CONFIGLANGS[forced_lang].search(block.group(1))
if lang is not None:
# Log the translation
trans: str = lang.group(1)[:50].strip()
LOG.debug(
f"Keeping {forced_lang} translation of `{self._title}`: "
+ f"`{trans}`, becoming its new lang"
)
return original_translation
text = text.replace(block.group(), lang.group(1))
self.lang = forced_lang # So write-all will not be cancelled
if lang is None:
LOG.debug(f"{forced_lang} not found in `{self._title}`")
return text
def replace_links(
self,
@ -387,86 +361,90 @@ class RedactionalObject(WritableObject):
return text
# Get slugified directory of this object
def dest_directory(self, prepend: str = "", append: str = "/") -> str:
return self._parentdir + prepend + slugify(self._title, max_length=100) + append
def dest_directory(
self, prepend_id: bool = True, prepend: str = "", append: str = "/"
) -> str:
_id: str = str(self._id) + "-" if prepend_id else ""
return (
self._parentdir
+ prepend
+ slugify(_id + self._title, max_length=100)
+ append
)
# Get filename of this object
def dest_filename(self) -> str:
return self._fileprefix + "." + self.lang + "." + CFG.export_filetype
def convert_title(self) -> str:
def convert_title(self, forced_lang: str) -> None:
LOG.debug(f"Convert title of currently untitled {type(self).__name__}")
if hasattr(self, "_title"):
LOG.debug(f"{type(self).__name__} {self._title} _title is already set")
return self._title
return
if self.titre is None:
LOG.debug(f"{type(self).__name__} title is None")
return ""
self._title = ""
return
if len(self.titre) == 0:
LOG.debug(f"{type(self).__name__} title is empty")
return ""
self._title = self.titre.strip() # Define temporary title to use in functions
self._title = self.translate_multi("titre")
LOG.debug(
f"`{self.lang}` `{self._title}` title was translated to : `{self._title}`"
)
LOG.debug(f"Convert document links of `{self._title}`")
self._title = ""
return
self._title = self.titre.strip()
self._title = self.translate_field(forced_lang, self._title)
LOG.debug(f"Convert document links of `{self._title}` title")
self._title = self.replace_links(self._title, DOCUMENT_LINK, Document)
LOG.debug(f"Convert article links of `{self._title}`")
LOG.debug(f"Convert article links of `{self._title}` title")
self._title = self.replace_links(self._title, ARTICLE_LINK, Article)
LOG.debug(f"Convert section links of `{self._title}`")
LOG.debug(f"Convert section links of `{self._title}` title")
self._title = self.replace_links(self._title, SECTION_LINK, Section)
return self.convert_field(self._title)
LOG.debug(f"Apply conversions to `{self._title}` title")
self._title = self.convert_field(self._title)
def convert_text(self) -> str:
def convert_text(self, forced_lang: str) -> None:
LOG.debug(f"Convert text of `{self._title}`")
if hasattr(self, "_text"):
LOG.debug(f"{type(self).__name__} {self._title} _text is already set")
return self._text
return
if self.texte is None:
LOG.debug(f"{type(self).__name__} {self._title} text is None")
return ""
self._text = ""
return
if len(self.texte) == 0:
LOG.debug(f"{type(self).__name__} {self._title} text is empty")
return ""
self._text = self.translate_multi("texte")
self._text = ""
return
self._text = self.translate_field(forced_lang, self.texte.strip())
LOG.debug(f"Convert document links of `{self._title}`")
self._text = self.replace_links(self._text, DOCUMENT_LINK, Document)
LOG.debug(f"Convert article links of `{self._title}`")
self._text = self.replace_links(self._text, ARTICLE_LINK, Article)
LOG.debug(f"Convert section links of `{self._title}`")
self._text = self.replace_links(self._text, SECTION_LINK, Section)
return self.convert_field(self._text)
self._text = self.convert_field(self._text)
def convert_extra(self) -> str:
def convert_extra(self) -> None:
LOG.debug(f"Convert extra of `{self._title}`")
if hasattr(self, "_extra"):
LOG.debug(f"{type(self).__name__} {self._title} _extra is already set")
return self._extra
return
if self.extra is None:
LOG.debug(f"{type(self).__name__} {self._title} extra is None")
return ""
self._extra = ""
return
if len(self.extra) == 0:
LOG.debug(f"{type(self).__name__} {self._title} extra is empty")
return ""
self._extra = ""
return
LOG.debug(f"Convert article links of `{self._title}`")
self._extra = self.replace_links(self.extra, ARTICLE_LINK, Article)
LOG.debug(f"Convert section links of `{self._title}`")
self._extra = self.replace_links(self._extra, SECTION_LINK, Section)
return self.convert_field(self._extra)
self._extra = self.convert_field(self._extra)
def __init__(self, *args, **kwargs):
# Initialise translation dict as empty, in the form lang: attr: value
self._translations: dict[str, dict[str, str]] = {}
super().__init__(*args, **kwargs)
# Initialize converted fields beginning with underscore
# Initialize converted fields, beginning with underscore
self._choosen_language = self.langue_choisie == "oui"
self._text = self.convert_text()
self._extra = self.convert_extra()
LOG.debug(
f"After __init__, `{type(self).__name__}` `{self._title}` contains"
+ f" translations: `{self._translations}`"
)
# Get related documents
def documents(self) -> tuple[Document]:
@ -527,76 +505,31 @@ class RedactionalObject(WritableObject):
f.write(self.content())
return self.dest_path()
# Output translated self objects
def translations(self) -> list[Self]:
translations: list[Self] = []
LOG.debug(f"`{self._title}` contains translations: `{self._translations}`")
for lang, translated_attrs in self._translations.items():
# Copy itself (with every attribute) as a base for the translated object
# translation: Self = deepcopy(self)
# Define attributes that new translation will need
attributes: dict[str, Any] = {
"id_trad": self.id_trad,
"titre": self.titre,
"date": self.date,
"maj": self.maj,
"statut": self.statut,
"descriptif": self.descriptif,
"id_secteur": self.id_secteur,
}
# Replace default attributes with translated ones
for attr, value in translated_attrs.items():
attributes[attr] = value
LOG.debug(
f"Instanciating {lang} translation of section `{self._title}`"
+ f" with attributes {attributes}"
# Apply post-init conversions and cancel the export if self not of the right lang
def convert(self, forced_lang: str) -> None:
self.convert_title(forced_lang)
self.convert_text(forced_lang)
self.convert_extra()
if self.lang != forced_lang:
raise LangNotFoundError(
f"`{self._title}` lang is {self.lang} instead of the wanted"
+ f" {forced_lang} and it dont contains"
+ f" {forced_lang} translation in Markup either"
)
translation: Self = type(self)(**attributes)
# Replace important attributes of the translated object
translation.lang = lang
translation._id = self._id # WARNING ?
translation._depth = self._depth # WARNING ?
translation._translations = {}
# Replace the translated attributes of the translated object
# For each translated attribute, output a DEBUG message
for attr in translated_attrs:
LOG.debug(
f"Instanciated {lang} translation of `{self._title}` `{attr}`"
+ f" = `{getattr(translation, attr)}`"
)
translations.append(translation)
return translations
# Get the children of this object
def children(self) -> tuple[tuple[WritableObject], ...]:
return (self.documents(),)
# Write all the children of this object
def write_children(self) -> RecursiveList:
def write_documents(self) -> RecursiveList:
LOG.debug(f"Writing children of {type(self).__name__} `{self._title}`")
output: RecursiveList = []
for children in self.children():
total = len(children)
for i, obj in enumerate(children):
children = self.documents()
total = len(children)
for i, obj in enumerate(children):
try:
output.append(
obj.write_all(self._depth, self.dest_directory(), i, total)
)
return output
# Perform all the write steps of this object
def write_all(
self, parentdepth: int, parentdir: str, index: int, total: int
) -> RecursiveList:
prepend: str = self.lang + ": " if total <= 0 else ""
output: RecursiveList = super().write_all(
parentdepth, parentdir, index, total, prepend
)
output.append(self.write_children())
translations = self.translations()
LOG.debug(f"Initialized translations of {self._title}: {translations}")
for translated in translations:
LOG.debug(f"Writing {translated.lang} translation of {self._title}")
translated.write_all(parentdepth, parentdir, index, 0)
except LangNotFoundError:
pass # For now, do nothing
return output
@ -654,6 +587,15 @@ class Article(RedactionalObject, NormalizedArticle):
.where(SpipAuteursLiens.id_objet == self._id)
)
# Perform all the write steps of this object
def write_all(
self, forced_lang: str, parentdepth: int, parentdir: str, index: int, total: int
) -> RecursiveList:
self.convert(forced_lang)
output: RecursiveList = super().write_all(parentdepth, parentdir, index, total)
output.append(self.write_documents())
return output
class Section(RedactionalObject, NormalizedSection):
class Meta:
@ -671,23 +613,48 @@ class Section(RedactionalObject, NormalizedSection):
return super().frontmatter(meta)
# Get articles of this section
def articles(self) -> tuple[Article]:
def articles(self, limit: int = 10**6) -> tuple[Article]:
LOG.debug(f"Initialize articles of `{self._title}`")
return (
Article.select()
.where((Article.id_rubrique == self._id) & (Article.lang == self.lang))
.where(Article.id_rubrique == self._id)
.order_by(Article.date.desc())
# .limit(limit)
.limit(limit)
)
# Get subsections of this section
def sections(self) -> tuple[Self]:
def sections(self, limit: int = 10**6) -> tuple[Self]:
LOG.debug(f"Initialize subsections of `{self._title}`")
return (
Section.select()
.where(Section.id_parent == self._id)
.order_by(Section.date.desc())
.limit(limit)
)
def children(self) -> tuple[tuple[WritableObject], ...]:
return (self.articles(),) + super().children() + (self.sections(),)
# Write all the children of this object
def write_children(self, forcedlang: str) -> RecursiveList:
super().write_documents()
LOG.debug(f"Writing children of {type(self).__name__} `{self._title}`")
output: RecursiveList = []
for children in (self.articles(), self.sections()):
total = len(children)
for i, obj in enumerate(children):
try:
output.append(
obj.write_all(
forcedlang, self._depth, self.dest_directory(), i, total
)
)
except LangNotFoundError:
pass # For now, do nothing
return output
# Perform all the write steps of this object
def write_all(
self, forced_lang: str, parentdepth: int, parentdir: str, index: int, total: int
) -> RecursiveList:
self.convert(forced_lang)
output: RecursiveList = super().write_all(parentdepth, parentdir, index, total)
output.append(self.write_children(forced_lang))
return output

View File

@ -5,7 +5,7 @@ from os.path import isfile
from shutil import rmtree
from spip2md.config import CFG
from spip2md.extended_models import RecursiveList, Section
from spip2md.extended_models import LangNotFoundError, RecursiveList, Section
from spip2md.spip_models import DB
from spip2md.style import BOLD, esc
@ -29,21 +29,27 @@ into the directory {esc(BOLD)}{parent_dir}{esc()}, \
as database user {esc(BOLD)}{CFG.db_user}{esc()}
"""
)
ROOTLOG.debug("Initialize root sections")
# Get all sections of parentID ROOTID
child_sections: tuple[Section, ...] = (
Section.select()
.where(Section.id_parent == ROOTID)
.where(Section.lang == CFG.export_languages[0])
.order_by(Section.date.desc())
)
nb: int = len(child_sections)
# Write each subsections (write their entire subtree)
for i, s in enumerate(child_sections):
ROOTLOG.debug(f"Begin exporting section {i}/{nb} {s._title}")
output.append(s.write_all(-1, CFG.output_dir, i, nb))
print() # Break line between level 0 sections in output
ROOTLOG.debug(f"Finished exporting section {i}/{nb} {s._title}")
# Write each sections (write their entire subtree) for each export language
# Force objects to handle <multi> blocks by setting them a lang
# Do this heavy looping because we dont know if languages are set in database or
# in markup, and as such language specified in database can differ from markup
for lang in CFG.export_languages:
ROOTLOG.debug("Initialize root sections")
# Get all sections of parentID ROOTID
child_sections: tuple[Section, ...] = (
Section.select()
.where(Section.id_parent == ROOTID)
.order_by(Section.date.desc())
)
nb: int = len(child_sections)
for i, s in enumerate(child_sections):
ROOTLOG.debug(f"Begin exporting {lang} root section {i}/{nb}")
try:
output.append(s.write_all(lang, -1, CFG.output_dir, i, nb))
except LangNotFoundError:
pass # For now, do nothing
print() # Break line between level 0 sections in output
ROOTLOG.debug(f"Finished exporting {lang} root section {i}/{nb} {s._title}")
return output

View File

@ -168,9 +168,13 @@ SECTION_LINK = (
# Multi language block, to be further processed per lang
MULTILANG_BLOCK = compile(r"<multi>(.+?)<\/multi>", S | I)
MULTILANGS = compile(
MULTILANGS = compile( # Matches agains all langs of multi blocks
r"\[([a-zA-Z\-]{2,6})\]\s*(.+?)\s*(?=\[[a-zA-Z\-]{2,6}\]|$)", S | I
)
CONFIGLANGS = { # lang of configuration: (match against this lang)
lang: compile(r"\[ *" + lang + r" *\]\s*(.+?)\s*(?=\[[a-zA-Z\-]{2,6}\]|$)", S | I)
for lang in CFG.export_languages
}
# WARNING probably useless text in metadata fields, to be removed
BLOAT = (
@ -315,7 +319,7 @@ SPECIAL_OUTPUT = (
compile(r"^([0-9]+?\.)(?= )"), # Counter
compile(r"(?<= )(->)(?= )"), # Arrow
compile(r"(?<=^Exporting )([0-9]+?)(?= )"), # Total
) + tuple(compile(r"(" + language + r"\:)") for language in CFG.export_languages)
) + tuple(compile(r"( " + language + r" )") for language in CFG.export_languages)
# Warning elements in terminal output to highlight
WARNING_OUTPUT = (