begin translations by letting translate create translated sub-RedactionalObjects
This commit is contained in:
parent
bc74fb0bfb
commit
123ae5945b
@ -1,13 +1,21 @@
|
|||||||
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
|
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
|
||||||
import logging
|
import logging
|
||||||
|
from copy import deepcopy
|
||||||
from os import makedirs
|
from os import makedirs
|
||||||
from os.path import basename, splitext
|
from os.path import basename, splitext
|
||||||
from re import Pattern, finditer, search
|
from re import Pattern, finditer, search
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
|
|
||||||
from peewee import DateTimeField, DoesNotExist
|
from peewee import (
|
||||||
|
BigAutoField,
|
||||||
|
BigIntegerField,
|
||||||
|
DateTimeField,
|
||||||
|
DoesNotExist,
|
||||||
|
IntegerField,
|
||||||
|
)
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
|
from typing_extensions import Self
|
||||||
from yaml import dump
|
from yaml import dump
|
||||||
|
|
||||||
from spip2md.config import CFG
|
from spip2md.config import CFG
|
||||||
@ -45,8 +53,8 @@ class SpipNormalized:
|
|||||||
statut: str
|
statut: str
|
||||||
# profondeur: int
|
# profondeur: int
|
||||||
# Custom
|
# Custom
|
||||||
obj_id: int = 0 # database ID of object, but same attribute name for all objects
|
obj_id: BigAutoField | int = 0 # same ID attribute name for all objects
|
||||||
depth: int # Equals `profondeur` for sections
|
depth: IntegerField | int # Equals `profondeur` for sections
|
||||||
fileprefix: str # String to prepend to written files
|
fileprefix: str # String to prepend to written files
|
||||||
parentdir: str # Path from output dir to direct parent
|
parentdir: str # Path from output dir to direct parent
|
||||||
style: tuple[int, ...] # Styles to apply to some elements of printed output
|
style: tuple[int, ...] # Styles to apply to some elements of printed output
|
||||||
@ -74,8 +82,8 @@ class NormalizedSection(SpipNormalized, SpipRubriques):
|
|||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
self.obj_id = self.id_rubrique.cast(as_type="int")
|
self.obj_id = self.id_rubrique
|
||||||
self.depth = self.profondeur.cast(as_type="int")
|
self.depth = self.profondeur
|
||||||
|
|
||||||
|
|
||||||
class NormalizedArticle(SpipNormalized, SpipArticles):
|
class NormalizedArticle(SpipNormalized, SpipArticles):
|
||||||
@ -84,7 +92,7 @@ class NormalizedArticle(SpipNormalized, SpipArticles):
|
|||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
self.obj_id = self.id_article.cast(as_type="int")
|
self.obj_id = self.id_article
|
||||||
|
|
||||||
|
|
||||||
class NormalizedDocument(SpipNormalized, SpipDocuments):
|
class NormalizedDocument(SpipNormalized, SpipDocuments):
|
||||||
@ -93,71 +101,36 @@ class NormalizedDocument(SpipNormalized, SpipDocuments):
|
|||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
self.obj_id = self.id_document.cast(as_type="int")
|
self.obj_id = self.id_document
|
||||||
|
|
||||||
|
|
||||||
class WritableObject(SpipNormalized):
|
class WritableObject(SpipNormalized):
|
||||||
|
translations: dict[str, Self]
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
# Set the lang attribute of self to the first one detected
|
# Detect every language present in <multi> blocks of text
|
||||||
# Then, if there’s other langs remaining, instanciate a new object with same
|
# For each language found in <multi> blocks, return its text in a dict such as lang: text
|
||||||
# input text but stripped of first lang
|
def translate_multi(self, text: str) -> dict[str, str]:
|
||||||
# Then returns the text of first detected language
|
# title: str = self.title() # Memoize self title # WARNING recurses
|
||||||
# WARNING currently only supports ONE <multi> block per text
|
title: str = self.titre.strip() # Memoize self title # WARNING recurses
|
||||||
def translate_multi(self, text: str) -> str:
|
translations: dict[str, str] = {self.lang: text} # Dict such as lang: text
|
||||||
# Memoize self title
|
# For each language of each <multi> block, add its text under the corresponding dict key
|
||||||
title: str = self.title()
|
for block in MULTILANG_BLOCK.finditer(text):
|
||||||
# First translation found, with eventual preexisting text
|
for lang in MULTILANGS.finditer(block.group(1)):
|
||||||
current_translation: str = text
|
if lang.group(1) == self.lang:
|
||||||
next_text: str = text # <multi> block(s) without first lang
|
translations[self.lang] = translations[self.lang].replace(
|
||||||
block = MULTILANG_BLOCK.search(text)
|
|
||||||
if block is not None:
|
|
||||||
lang = MULTILANGS.search(block.group(1))
|
|
||||||
if lang is not None:
|
|
||||||
# set current lang to found first lang
|
|
||||||
self.lang = lang.group(1)
|
|
||||||
# replace multi blocks of current text with first lang
|
|
||||||
current_translation = current_translation.replace(
|
|
||||||
block.group(), lang.group(2)
|
block.group(), lang.group(2)
|
||||||
)
|
)
|
||||||
# Log the translation
|
elif lang.group(1) in translations:
|
||||||
translated: str = lang.group(2)[:60].strip()
|
translations[lang.group(1)] += lang.group(2)
|
||||||
logging.info(
|
|
||||||
f"{title} lang becomes {self.lang}, with text {translated} …"
|
|
||||||
)
|
|
||||||
# remove first lang from next_text
|
|
||||||
next_text = next_text.replace(lang.group(), "")
|
|
||||||
else:
|
else:
|
||||||
# Log the unexpected situation
|
translations[lang.group(1)] = lang.group(2)
|
||||||
logging.warning(
|
# Logs the translation
|
||||||
f"Unexpected empty <multi> block in {title}, deleting it anyway"
|
translated: str = lang.group(2)[:50].strip()
|
||||||
)
|
logging.info(f"{title} {lang.group(1)} translation: {translated}")
|
||||||
# Do the same for the next text
|
return translations
|
||||||
next_block = MULTILANG_BLOCK.search(next_text)
|
|
||||||
if next_block is not None:
|
|
||||||
next_lang = MULTILANGS.search(next_block.group(1))
|
|
||||||
if next_lang is not None:
|
|
||||||
# If there is a remaining lang
|
|
||||||
# Instantiate & write a similar object with modified text & lang
|
|
||||||
logging.info(f"Instanciate {next_lang.group(1)} translation of {title}")
|
|
||||||
next_lang_obj: WritableObject = type(self)(
|
|
||||||
texte=next_text,
|
|
||||||
lang=next_lang.group(1),
|
|
||||||
titre=self.titre,
|
|
||||||
descriptif=self.descriptif,
|
|
||||||
)
|
|
||||||
next_lang_obj.style = self.style
|
|
||||||
next_lang_obj.depth = self.depth
|
|
||||||
next_lang_obj.parentdir = self.dest_directory()
|
|
||||||
# WARNING the output will appear in terminal & logfile but won’t return
|
|
||||||
next_lang_obj.begin_message(0, 0) # WARNING wrong counter
|
|
||||||
try:
|
|
||||||
next_lang_obj.end_message(next_lang_obj.write())
|
|
||||||
except Exception as err:
|
|
||||||
next_lang_obj.end_message(err)
|
|
||||||
# Return the first detected language
|
|
||||||
return current_translation
|
|
||||||
|
|
||||||
# Apply a mapping from regex maps
|
# Apply a mapping from regex maps
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -246,7 +219,9 @@ class WritableObject(SpipNormalized):
|
|||||||
return stylized
|
return stylized
|
||||||
|
|
||||||
# Print the message telling what is going to be done
|
# Print the message telling what is going to be done
|
||||||
def begin_message(self, index: int, limit: int, step: int = 100) -> list[str]:
|
def begin_message(
|
||||||
|
self, index: int, limit: int, prepend: str = "", step: int = 100
|
||||||
|
) -> list[str]:
|
||||||
output: list[str] = []
|
output: list[str] = []
|
||||||
# Output the remaining number of objects to export every step object
|
# Output the remaining number of objects to export every step object
|
||||||
if index % step == 0:
|
if index % step == 0:
|
||||||
@ -258,6 +233,7 @@ class WritableObject(SpipNormalized):
|
|||||||
self.style_print(output[-1])
|
self.style_print(output[-1])
|
||||||
# Output the counter & title of the object being exported
|
# Output the counter & title of the object being exported
|
||||||
output.append(f"{index + 1}. ")
|
output.append(f"{index + 1}. ")
|
||||||
|
output.append(prepend)
|
||||||
if len(self.title()) == 0:
|
if len(self.title()) == 0:
|
||||||
output[-1] += "EMPTY NAME"
|
output[-1] += "EMPTY NAME"
|
||||||
else:
|
else:
|
||||||
@ -306,11 +282,12 @@ class Document(WritableObject, NormalizedDocument):
|
|||||||
|
|
||||||
|
|
||||||
class RedactionalObject(WritableObject):
|
class RedactionalObject(WritableObject):
|
||||||
id_trad: int
|
id_trad: BigIntegerField | int
|
||||||
id_rubrique: int
|
id_rubrique: BigIntegerField | int
|
||||||
|
# date: DateTimeField | str
|
||||||
date: DateTimeField
|
date: DateTimeField
|
||||||
maj: str
|
maj: str
|
||||||
id_secteur: int
|
id_secteur: BigIntegerField | int
|
||||||
extra: str
|
extra: str
|
||||||
langue_choisie: str
|
langue_choisie: str
|
||||||
# Custom
|
# Custom
|
||||||
@ -342,6 +319,22 @@ class RedactionalObject(WritableObject):
|
|||||||
)
|
)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def title(self) -> str:
    """Return the converted title of this object in its own language.

    The raw title (`self.titre`) is passed to `self.translate_multi`, which
    returns a dict such as lang: text. The entry matching `self.lang` becomes
    the title that is returned (after `self.convert_field`). For every other
    language a deep copy of this object is made, its `titre` is set to that
    translation, and the copy is stored in `self.translations` under the
    language code.

    Returns:
        The converted title, or "" when the object has no title.
    """
    # Covers both a missing (None) and an empty title
    if not self.titre:
        return ""
    text: str = self.titre
    # Collect in a local dict so every language is kept, instead of the
    # mapping being reset each time a foreign language is met
    translations = {}
    # Handle <multi> multi language blocks
    # translate_multi returns a dict, so .items() is needed to get the pairs
    for lang, translation in self.translate_multi(text).items():
        if lang == self.lang:
            text = translation
        else:
            translated = deepcopy(self)
            translated.titre = translation
            translations[lang] = translated
    # Only set the attribute when another language was actually found
    if translations:
        self.translations = translations
    return self.convert_field(text)
|
||||||
|
|
||||||
def text(self) -> str:
|
def text(self) -> str:
|
||||||
if self.texte is None:
|
if self.texte is None:
|
||||||
return ""
|
return ""
|
||||||
@ -349,7 +342,13 @@ class RedactionalObject(WritableObject):
|
|||||||
return ""
|
return ""
|
||||||
text: str = self.texte
|
text: str = self.texte
|
||||||
# Handle <multi> multi language blocks
|
# Handle <multi> multi language blocks
|
||||||
text = self.translate_multi(text)
|
for lang, translation in self.translate_multi(text):
|
||||||
|
if lang == self.lang:
|
||||||
|
text = translation
|
||||||
|
else:
|
||||||
|
self.translations: dict[str, Self] = {}
|
||||||
|
self.translations[lang] = deepcopy(self)
|
||||||
|
self.translations[lang].texte = translation
|
||||||
# Replace ID based SPIP links with relative path links
|
# Replace ID based SPIP links with relative path links
|
||||||
text = self.replace_links(text, DOCUMENT_LINK, Document)
|
text = self.replace_links(text, DOCUMENT_LINK, Document)
|
||||||
text = self.replace_links(text, ARTICLE_LINK, Article)
|
text = self.replace_links(text, ARTICLE_LINK, Article)
|
||||||
|
Loading…
Reference in New Issue
Block a user