begin translations by letting translate create translated sub-RedactionalObjects

This commit is contained in:
Guilhem Fauré 2023-06-01 11:51:55 +02:00
parent bc74fb0bfb
commit 123ae5945b

View File

@ -1,13 +1,21 @@
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
import logging
from copy import deepcopy
from os import makedirs
from os.path import basename, splitext
from re import Pattern, finditer, search
from shutil import copyfile
from typing import Any, Optional
from peewee import DateTimeField, DoesNotExist
from peewee import (
BigAutoField,
BigIntegerField,
DateTimeField,
DoesNotExist,
IntegerField,
)
from slugify import slugify
from typing_extensions import Self
from yaml import dump
from spip2md.config import CFG
@ -45,8 +53,8 @@ class SpipNormalized:
statut: str
# profondeur: int
# Custom
obj_id: int = 0 # database ID of object, but same attribute name for all objects
depth: int # Equals `profondeur` for sections
obj_id: BigAutoField | int = 0 # same ID attribute name for all objects
depth: IntegerField | int # Equals `profondeur` for sections
fileprefix: str # String to prepend to written files
parentdir: str # Path from output dir to direct parent
style: tuple[int, ...] # Styles to apply to some elements of printed output
@ -74,8 +82,8 @@ class NormalizedSection(SpipNormalized, SpipRubriques):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.obj_id = self.id_rubrique.cast(as_type="int")
self.depth = self.profondeur.cast(as_type="int")
self.obj_id = self.id_rubrique
self.depth = self.profondeur
class NormalizedArticle(SpipNormalized, SpipArticles):
@ -84,7 +92,7 @@ class NormalizedArticle(SpipNormalized, SpipArticles):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.obj_id = self.id_article.cast(as_type="int")
self.obj_id = self.id_article
class NormalizedDocument(SpipNormalized, SpipDocuments):
@ -93,71 +101,36 @@ class NormalizedDocument(SpipNormalized, SpipDocuments):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.obj_id = self.id_document.cast(as_type="int")
self.obj_id = self.id_document
class WritableObject(SpipNormalized):
translations: dict[str, Self]
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Set the lang attribute of self to the first one detected
# Then, if theres other langs remaining, instanciate a new object with same
# input text but stripped of first lang
# Then returns the text of first detected language
# WARNING currently only supports ONE <multi> block per text
def translate_multi(self, text: str) -> str:
# Memoize self title
title: str = self.title()
# First translation found, with eventual preexisting text
current_translation: str = text
next_text: str = text # <multi> block(s) without first lang
block = MULTILANG_BLOCK.search(text)
if block is not None:
lang = MULTILANGS.search(block.group(1))
if lang is not None:
# set current lang to found first lang
self.lang = lang.group(1)
# replace multi blocks of current text with first lang
current_translation = current_translation.replace(
block.group(), lang.group(2)
)
# Log the translation
translated: str = lang.group(2)[:60].strip()
logging.info(
f"{title} lang becomes {self.lang}, with text {translated}"
)
# remove first lang from next_text
next_text = next_text.replace(lang.group(), "")
else:
# Log the unexpected situation
logging.warning(
f"Unexpected empty <multi> block in {title}, deleting it anyway"
)
# Do the same for the next text
next_block = MULTILANG_BLOCK.search(next_text)
if next_block is not None:
next_lang = MULTILANGS.search(next_block.group(1))
if next_lang is not None:
# If there is a remaining lang
# Instantiate & write a similar object with modified text & lang
logging.info(f"Instanciate {next_lang.group(1)} translation of {title}")
next_lang_obj: WritableObject = type(self)(
texte=next_text,
lang=next_lang.group(1),
titre=self.titre,
descriptif=self.descriptif,
)
next_lang_obj.style = self.style
next_lang_obj.depth = self.depth
next_lang_obj.parentdir = self.dest_directory()
# WARNING the output will appear in terminal & logfile but wont return
next_lang_obj.begin_message(0, 0) # WARNING wrong counter
try:
next_lang_obj.end_message(next_lang_obj.write())
except Exception as err:
next_lang_obj.end_message(err)
# Return the first detected language
return current_translation
# Detect every language present in <multi> blocks of text
# For each language in <multi> block, output a new object with the translation
def translate_multi(self, text: str) -> dict[str, str]:
# title: str = self.title() # Memoize self title # WARNING recurses
title: str = self.titre.strip() # Memoize self title # WARNING recurses
translations: dict[str, str] = {self.lang: text} # Dict such as lang: text
# for each langs of <multi> blocks, add its text to the corresponding dict key
for block in MULTILANG_BLOCK.finditer(text):
for lang in MULTILANGS.finditer(block.group(1)):
if lang.group(1) == self.lang:
translations[self.lang] = translations[self.lang].replace(
block.group(), lang.group(2)
)
elif lang.group(1) in translations:
translations[lang.group(1)] += lang.group(2)
else:
translations[lang.group(1)] = lang.group(2)
# Logs the translation
translated: str = lang.group(2)[:50].strip()
logging.info(f"{title} {lang.group(1)} translation: {translated}")
return translations
# Apply a mapping from regex maps
@staticmethod
@ -246,7 +219,9 @@ class WritableObject(SpipNormalized):
return stylized
# Print the message telling what is going to be done
def begin_message(self, index: int, limit: int, step: int = 100) -> list[str]:
def begin_message(
self, index: int, limit: int, prepend: str = "", step: int = 100
) -> list[str]:
output: list[str] = []
# Output the remaining number of objects to export every step object
if index % step == 0:
@ -258,6 +233,7 @@ class WritableObject(SpipNormalized):
self.style_print(output[-1])
# Output the counter & title of the object being exported
output.append(f"{index + 1}. ")
output.append(prepend)
if len(self.title()) == 0:
output[-1] += "EMPTY NAME"
else:
@ -306,11 +282,12 @@ class Document(WritableObject, NormalizedDocument):
class RedactionalObject(WritableObject):
id_trad: int
id_rubrique: int
id_trad: BigIntegerField | int
id_rubrique: BigIntegerField | int
# date: DateTimeField | str
date: DateTimeField
maj: str
id_secteur: int
id_secteur: BigIntegerField | int
extra: str
langue_choisie: str
# Custom
@ -342,6 +319,22 @@ class RedactionalObject(WritableObject):
)
return text
def title(self) -> str:
if self.texte is None:
return ""
if len(self.texte) == 0:
return ""
text: str = self.texte
# Handle <multi> multi language blocks
for lang, translation in self.translate_multi(text):
if lang == self.lang:
text = translation
else:
self.translations: dict[str, Self] = {}
self.translations[lang] = deepcopy(self)
self.translations[lang].titre = translation
return self.convert_field(text)
def text(self) -> str:
if self.texte is None:
return ""
@ -349,7 +342,13 @@ class RedactionalObject(WritableObject):
return ""
text: str = self.texte
# Handle <multi> multi language blocks
text = self.translate_multi(text)
for lang, translation in self.translate_multi(text):
if lang == self.lang:
text = translation
else:
self.translations: dict[str, Self] = {}
self.translations[lang] = deepcopy(self)
self.translations[lang].texte = translation
# Replace ID based SPIP links with relative path links
text = self.replace_links(text, DOCUMENT_LINK, Document)
text = self.replace_links(text, ARTICLE_LINK, Article)