begin translations by letting translate create translated sub-RedactionalObjects

2023-06-01 11:51:55 +02:00 · 2023-06-01 11:51:55 +02:00 · 123ae5945b
commit 123ae5945b
parent bc74fb0bfb
1 changed files with 69 additions and 70 deletions
--- a/spip2md/extended_models.py
+++ b/spip2md/extended_models.py
@ -1,13 +1,21 @@
 # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
 import logging
 from copy import deepcopy
 from os import makedirs
 from os.path import basename, splitext
 from re import Pattern, finditer, search
 from shutil import copyfile
 from typing import Any, Optional
-from peewee import DateTimeField, DoesNotExist
+from peewee import (
    BigAutoField,
    BigIntegerField,
    DateTimeField,
    DoesNotExist,
    IntegerField,
 )
 from slugify import slugify
 from typing_extensions import Self
 from yaml import dump
 from spip2md.config import CFG
@ -45,8 +53,8 @@ class SpipNormalized:
    statut: str
    # profondeur: int
    # Custom
-    obj_id: int = 0  # database ID of object, but same attribute name for all objects
+    obj_id: BigAutoField | int = 0  # same ID attribute name for all objects
-    depth: int  # Equals `profondeur` for sections
+    depth: IntegerField | int  # Equals `profondeur` for sections
    fileprefix: str  # String to prepend to written files
    parentdir: str  # Path from output dir to direct parent
    style: tuple[int, ...]  # Styles to apply to some elements of printed output
@ -74,8 +82,8 @@ class NormalizedSection(SpipNormalized, SpipRubriques):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
-        self.obj_id = self.id_rubrique.cast(as_type="int")
+        self.obj_id = self.id_rubrique
-        self.depth = self.profondeur.cast(as_type="int")
+        self.depth = self.profondeur
 class NormalizedArticle(SpipNormalized, SpipArticles):
@ -84,7 +92,7 @@ class NormalizedArticle(SpipNormalized, SpipArticles):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
-        self.obj_id = self.id_article.cast(as_type="int")
+        self.obj_id = self.id_article
 class NormalizedDocument(SpipNormalized, SpipDocuments):
@ -93,71 +101,36 @@ class NormalizedDocument(SpipNormalized, SpipDocuments):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
-        self.obj_id = self.id_document.cast(as_type="int")
+        self.obj_id = self.id_document
 class WritableObject(SpipNormalized):
    translations: dict[str, Self]
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
-    # Set the lang attribute of self to the first one detected
+    # Detect every language present in <multi> blocks of text
-    # Then, if there’s other langs remaining, instanciate a new object with same
+    # For each language in <multi> block, output a new object with the translation
-    # input text but stripped of first lang
+    def translate_multi(self, text: str) -> dict[str, str]:
-    # Then returns the text of first detected language
+        # title: str = self.title()  # Memoize self title # WARNING recurses
-    # WARNING currently only supports ONE <multi> block per text
+        title: str = self.titre.strip()  # Memoize self title # WARNING recurses
-    def translate_multi(self, text: str) -> str:
+        translations: dict[str, str] = {self.lang: text}  # Dict such as lang: text
-        # Memoize self title
+        # for each langs of <multi> blocks, add its text to the corresponding dict key
-        title: str = self.title()
+        for block in MULTILANG_BLOCK.finditer(text):
-        # First translation found, with eventual preexisting text
+            for lang in MULTILANGS.finditer(block.group(1)):
-        current_translation: str = text
+                if lang.group(1) == self.lang:
-        next_text: str = text  # <multi> block(s) without first lang
+                    translations[self.lang] = translations[self.lang].replace(
        block = MULTILANG_BLOCK.search(text)
        if block is not None:
            lang = MULTILANGS.search(block.group(1))
            if lang is not None:
                # set current lang to found first lang
                self.lang = lang.group(1)
                # replace multi blocks of current text with first lang
                current_translation = current_translation.replace(
                        block.group(), lang.group(2)
                    )
-                # Log the translation
+                elif lang.group(1) in translations:
-                translated: str = lang.group(2)[:60].strip()
+                    translations[lang.group(1)] += lang.group(2)
                logging.info(
                    f"{title} lang becomes {self.lang}, with text {translated} …"
                )
                # remove first lang from next_text
                next_text = next_text.replace(lang.group(), "")
                else:
-                # Log the unexpected situation
+                    translations[lang.group(1)] = lang.group(2)
-                logging.warning(
+                # Logs the translation
-                    f"Unexpected empty <multi> block in {title}, deleting it anyway"
+                translated: str = lang.group(2)[:50].strip()
-                )
+                logging.info(f"{title} {lang.group(1)} translation: {translated}")
-        # Do the same for the next text
+        return translations
        next_block = MULTILANG_BLOCK.search(next_text)
        if next_block is not None:
            next_lang = MULTILANGS.search(next_block.group(1))
            if next_lang is not None:
                # If there is a remaining lang
                # Instantiate & write a similar object with modified text & lang
                logging.info(f"Instanciate {next_lang.group(1)} translation of {title}")
                next_lang_obj: WritableObject = type(self)(
                    texte=next_text,
                    lang=next_lang.group(1),
                    titre=self.titre,
                    descriptif=self.descriptif,
                )
                next_lang_obj.style = self.style
                next_lang_obj.depth = self.depth
                next_lang_obj.parentdir = self.dest_directory()
                # WARNING the output will appear in terminal & logfile but won’t return
                next_lang_obj.begin_message(0, 0)  # WARNING wrong counter
                try:
                    next_lang_obj.end_message(next_lang_obj.write())
                except Exception as err:
                    next_lang_obj.end_message(err)
        # Return the first detected language
        return current_translation
    # Apply a mapping from regex maps
    @staticmethod
@ -246,7 +219,9 @@ class WritableObject(SpipNormalized):
        return stylized
    # Print the message telling what is going to be done
-    def begin_message(self, index: int, limit: int, step: int = 100) -> list[str]:
+    def begin_message(
        self, index: int, limit: int, prepend: str = "", step: int = 100
    ) -> list[str]:
        output: list[str] = []
        # Output the remaining number of objects to export every step object
        if index % step == 0:
@ -258,6 +233,7 @@ class WritableObject(SpipNormalized):
            self.style_print(output[-1])
        # Output the counter & title of the object being exported
        output.append(f"{index + 1}. ")
        output.append(prepend)
        if len(self.title()) == 0:
            output[-1] += "EMPTY NAME"
        else:
@ -306,11 +282,12 @@ class Document(WritableObject, NormalizedDocument):
 class RedactionalObject(WritableObject):
-    id_trad: int
+    id_trad: BigIntegerField | int
-    id_rubrique: int
+    id_rubrique: BigIntegerField | int
    # date: DateTimeField | str
    date: DateTimeField
    maj: str
-    id_secteur: int
+    id_secteur: BigIntegerField | int
    extra: str
    langue_choisie: str
    # Custom
@ -342,6 +319,22 @@ class RedactionalObject(WritableObject):
                    )
        return text
    def title(self) -> str:
        if self.texte is None:
            return ""
        if len(self.texte) == 0:
            return ""
        text: str = self.texte
        # Handle <multi> multi language blocks
        for lang, translation in self.translate_multi(text):
            if lang == self.lang:
                text = translation
            else:
                self.translations: dict[str, Self] = {}
                self.translations[lang] = deepcopy(self)
                self.translations[lang].titre = translation
        return self.convert_field(text)
    def text(self) -> str:
        if self.texte is None:
            return ""
@ -349,7 +342,13 @@ class RedactionalObject(WritableObject):
            return ""
        text: str = self.texte
        # Handle <multi> multi language blocks
-        text = self.translate_multi(text)
+        for lang, translation in self.translate_multi(text):
            if lang == self.lang:
                text = translation
            else:
                self.translations: dict[str, Self] = {}
                self.translations[lang] = deepcopy(self)
                self.translations[lang].texte = translation
        # Replace ID based SPIP links with relative path links
        text = self.replace_links(text, DOCUMENT_LINK, Document)
        text = self.replace_links(text, ARTICLE_LINK, Article)