begin translations by letting translate create translated sub-RedactionalObjects

This commit is contained in:
Guilhem Fauré 2023-06-01 11:51:55 +02:00
parent bc74fb0bfb
commit 123ae5945b

View File

@ -1,13 +1,21 @@
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
import logging import logging
from copy import deepcopy
from os import makedirs from os import makedirs
from os.path import basename, splitext from os.path import basename, splitext
from re import Pattern, finditer, search from re import Pattern, finditer, search
from shutil import copyfile from shutil import copyfile
from typing import Any, Optional from typing import Any, Optional
from peewee import DateTimeField, DoesNotExist from peewee import (
BigAutoField,
BigIntegerField,
DateTimeField,
DoesNotExist,
IntegerField,
)
from slugify import slugify from slugify import slugify
from typing_extensions import Self
from yaml import dump from yaml import dump
from spip2md.config import CFG from spip2md.config import CFG
@ -45,8 +53,8 @@ class SpipNormalized:
statut: str statut: str
# profondeur: int # profondeur: int
# Custom # Custom
obj_id: int = 0 # database ID of object, but same attribute name for all objects obj_id: BigAutoField | int = 0 # same ID attribute name for all objects
depth: int # Equals `profondeur` for sections depth: IntegerField | int # Equals `profondeur` for sections
fileprefix: str # String to prepend to written files fileprefix: str # String to prepend to written files
parentdir: str # Path from output dir to direct parent parentdir: str # Path from output dir to direct parent
style: tuple[int, ...] # Styles to apply to some elements of printed output style: tuple[int, ...] # Styles to apply to some elements of printed output
@ -74,8 +82,8 @@ class NormalizedSection(SpipNormalized, SpipRubriques):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.obj_id = self.id_rubrique.cast(as_type="int") self.obj_id = self.id_rubrique
self.depth = self.profondeur.cast(as_type="int") self.depth = self.profondeur
class NormalizedArticle(SpipNormalized, SpipArticles): class NormalizedArticle(SpipNormalized, SpipArticles):
@ -84,7 +92,7 @@ class NormalizedArticle(SpipNormalized, SpipArticles):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.obj_id = self.id_article.cast(as_type="int") self.obj_id = self.id_article
class NormalizedDocument(SpipNormalized, SpipDocuments): class NormalizedDocument(SpipNormalized, SpipDocuments):
@ -93,71 +101,36 @@ class NormalizedDocument(SpipNormalized, SpipDocuments):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.obj_id = self.id_document.cast(as_type="int") self.obj_id = self.id_document
class WritableObject(SpipNormalized): class WritableObject(SpipNormalized):
translations: dict[str, Self]
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
# Set the lang attribute of self to the first one detected # Detect every language present in <multi> blocks of text
# Then, if theres other langs remaining, instanciate a new object with same # For each language in <multi> block, output a new object with the translation
# input text but stripped of first lang def translate_multi(self, text: str) -> dict[str, str]:
# Then returns the text of first detected language # title: str = self.title() # Memoize self title # WARNING recurses
# WARNING currently only supports ONE <multi> block per text title: str = self.titre.strip() # Memoize self title # WARNING recurses
def translate_multi(self, text: str) -> str: translations: dict[str, str] = {self.lang: text} # Dict such as lang: text
# Memoize self title # for each langs of <multi> blocks, add its text to the corresponding dict key
title: str = self.title() for block in MULTILANG_BLOCK.finditer(text):
# First translation found, with eventual preexisting text for lang in MULTILANGS.finditer(block.group(1)):
current_translation: str = text if lang.group(1) == self.lang:
next_text: str = text # <multi> block(s) without first lang translations[self.lang] = translations[self.lang].replace(
block = MULTILANG_BLOCK.search(text)
if block is not None:
lang = MULTILANGS.search(block.group(1))
if lang is not None:
# set current lang to found first lang
self.lang = lang.group(1)
# replace multi blocks of current text with first lang
current_translation = current_translation.replace(
block.group(), lang.group(2) block.group(), lang.group(2)
) )
# Log the translation elif lang.group(1) in translations:
translated: str = lang.group(2)[:60].strip() translations[lang.group(1)] += lang.group(2)
logging.info(
f"{title} lang becomes {self.lang}, with text {translated}"
)
# remove first lang from next_text
next_text = next_text.replace(lang.group(), "")
else: else:
# Log the unexpected situation translations[lang.group(1)] = lang.group(2)
logging.warning( # Logs the translation
f"Unexpected empty <multi> block in {title}, deleting it anyway" translated: str = lang.group(2)[:50].strip()
) logging.info(f"{title} {lang.group(1)} translation: {translated}")
# Do the same for the next text return translations
next_block = MULTILANG_BLOCK.search(next_text)
if next_block is not None:
next_lang = MULTILANGS.search(next_block.group(1))
if next_lang is not None:
# If there is a remaining lang
# Instantiate & write a similar object with modified text & lang
logging.info(f"Instanciate {next_lang.group(1)} translation of {title}")
next_lang_obj: WritableObject = type(self)(
texte=next_text,
lang=next_lang.group(1),
titre=self.titre,
descriptif=self.descriptif,
)
next_lang_obj.style = self.style
next_lang_obj.depth = self.depth
next_lang_obj.parentdir = self.dest_directory()
# WARNING the output will appear in terminal & logfile but wont return
next_lang_obj.begin_message(0, 0) # WARNING wrong counter
try:
next_lang_obj.end_message(next_lang_obj.write())
except Exception as err:
next_lang_obj.end_message(err)
# Return the first detected language
return current_translation
# Apply a mapping from regex maps # Apply a mapping from regex maps
@staticmethod @staticmethod
@ -246,7 +219,9 @@ class WritableObject(SpipNormalized):
return stylized return stylized
# Print the message telling what is going to be done # Print the message telling what is going to be done
def begin_message(self, index: int, limit: int, step: int = 100) -> list[str]: def begin_message(
self, index: int, limit: int, prepend: str = "", step: int = 100
) -> list[str]:
output: list[str] = [] output: list[str] = []
# Output the remaining number of objects to export every step object # Output the remaining number of objects to export every step object
if index % step == 0: if index % step == 0:
@ -258,6 +233,7 @@ class WritableObject(SpipNormalized):
self.style_print(output[-1]) self.style_print(output[-1])
# Output the counter & title of the object being exported # Output the counter & title of the object being exported
output.append(f"{index + 1}. ") output.append(f"{index + 1}. ")
output.append(prepend)
if len(self.title()) == 0: if len(self.title()) == 0:
output[-1] += "EMPTY NAME" output[-1] += "EMPTY NAME"
else: else:
@ -306,11 +282,12 @@ class Document(WritableObject, NormalizedDocument):
class RedactionalObject(WritableObject): class RedactionalObject(WritableObject):
id_trad: int id_trad: BigIntegerField | int
id_rubrique: int id_rubrique: BigIntegerField | int
# date: DateTimeField | str
date: DateTimeField date: DateTimeField
maj: str maj: str
id_secteur: int id_secteur: BigIntegerField | int
extra: str extra: str
langue_choisie: str langue_choisie: str
# Custom # Custom
@ -342,6 +319,22 @@ class RedactionalObject(WritableObject):
) )
return text return text
def title(self) -> str:
    """Return this object's title in its own language, converted for output.

    The raw title (``self.titre``) may hold several languages; it is handed
    to ``self.translate_multi``, which returns a ``{lang: text}`` mapping.
    The entry matching ``self.lang`` becomes the returned title. Every other
    language is stored in ``self.translations[lang]`` as a copy of this
    object whose ``titre`` is the translated title.

    Returns an empty string when the object has no title.
    """
    # Covers both a NULL (None) and an empty title
    if not self.titre:
        return ""
    raw_title: str = self.titre
    # Create the container only once, so translations registered earlier
    # (by a previous language of this loop or by another method) are kept
    if not hasattr(self, "translations"):
        self.translations = {}
    # .items() is required: iterating the dict itself yields only the keys,
    # and a 2-letter language code would be silently unpacked into 2 chars
    for lang, translation in self.translate_multi(raw_title).items():
        if lang == self.lang:
            raw_title = translation
            continue
        # Reuse an already existing translated object, else derive one
        if lang not in self.translations:
            self.translations[lang] = deepcopy(self)
        self.translations[lang].titre = translation
    return self.convert_field(raw_title)
def text(self) -> str: def text(self) -> str:
if self.texte is None: if self.texte is None:
return "" return ""
@ -349,7 +342,13 @@ class RedactionalObject(WritableObject):
return "" return ""
text: str = self.texte text: str = self.texte
# Handle <multi> multi language blocks # Handle <multi> multi language blocks
text = self.translate_multi(text) for lang, translation in self.translate_multi(text):
if lang == self.lang:
text = translation
else:
self.translations: dict[str, Self] = {}
self.translations[lang] = deepcopy(self)
self.translations[lang].texte = translation
# Replace ID based SPIP links with relative path links # Replace ID based SPIP links with relative path links
text = self.replace_links(text, DOCUMENT_LINK, Document) text = self.replace_links(text, DOCUMENT_LINK, Document)
text = self.replace_links(text, ARTICLE_LINK, Article) text = self.replace_links(text, ARTICLE_LINK, Article)