fix html cleaning & reenabled body converting

2023-05-30 12:39:10 +02:00 · 2023-05-30 12:39:10 +02:00 · 9c79433f74
commit 9c79433f74
parent d20976c59d
2 changed files with 27 additions and 21 deletions
--- a/spip2md/init.py
+++ b/spip2md/init.py
@ -1,6 +1,6 @@
 # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
 import sys
-from os import makedirs, remove
+from os import makedirs
 from shutil import rmtree
 from spip2md.config import CFG
@ -48,9 +48,6 @@ def main(*argv):
    if CFG.clear_output:
        rmtree(CFG.output_dir, True)
    makedirs(CFG.output_dir, exist_ok=True)
    # Clear the log file
    # if CFG.clear_log:
    #     remove(CFG.logfile)
    # Get the virtual id=0 section
    root: Rubrique = RootRubrique()
--- a/spip2md/spipobjects.py
+++ b/spip2md/spipobjects.py
@ -1,6 +1,6 @@
 # SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
 import logging
-from os import makedirs
+from os import makedirs, remove
 from os.path import basename, splitext
 from re import finditer, search, sub
 from shutil import copyfile
@ -34,6 +34,9 @@ from spip2md.regexmap import (
 )
 from spip2md.style import BLUE, BOLD, GREEN, WARNING_STYLE, YELLOW, esc
 # Clear the previous log file if needed
 if CFG.clear_log:
    remove(CFG.logfile)
 # Output logs to logfile
 logging.basicConfig(filename=CFG.logfile, encoding="utf-8")
@ -69,8 +72,16 @@ class SpipWritable:
        return MULTILANG_BLOCK.sub(replace_lang, text)
    # Remove remaining HTML tags
    @staticmethod
    def clean_html(string: str) -> str:
        if string is not None and len(string) > 0:
            return HTMLTAG.sub("", string)
        else:
            return ""
    # Apply different mappings to a text field, like SPIP to Markdown or encoding
-    def convert(self, text: Optional[str]) -> str:
+    def convert(self, text: Optional[str], clean_html: bool = True) -> str:
        # Return unknown char surrounded by context_length chars
        def unknown_chars_context(text: str, char: str, context_len: int = 24) -> str:
            context: str = r".{0," + str(context_len) + r"}"
@ -84,12 +95,21 @@ class SpipWritable:
                return char
        if text is not None and len(text) > 0:
            # Convert SPIP syntax to Markdown
            for spip, markdown in SPIP_MARKDOWN:
                text = spip.sub(markdown, text)
            # Remove useless text
            for bloat in BLOAT:
                text = bloat.sub("", text)
            # Convert broken ISO encoding to UTF
            for iso, utf in ISO_UTF:
                text = text.replace(iso, utf)
            # Handle <multi> multi language blocks
            text = self.translate(text)
            # Delete remaining HTML tags in body WARNING
            if clean_html:
                text = self.clean_html(text)
            # Warn about unknown chars
            for char in UNKNOWN_ISO:
                lastend: int = 0
                for match in finditer("(" + char + ")+", text):
@ -98,7 +118,6 @@ class SpipWritable:
                        f"Unknown char {char} found in {self.titre[:40]} at: {context}"
                    )
                    lastend = match.end()
            text = self.translate(text)
        else:
            return ""
        return text
@ -131,7 +150,6 @@ class SpipWritable:
        # Output the remaining number of objects to export every step object
        if index % step == 0:
            output.append(f"Exporting {limit-index}")
            if hasattr(self, "profondeur"):
            output[-1] += f" level {self.profondeur}"
            s: str = "s" if limit - index > 1 else ""
            output[-1] += f" {type(self).__name__}{s}"
@ -302,21 +320,12 @@ class SpipObject(SpipWritable):
            body += "\n\n# EXTRA\n\n" + self.extra
        return body
    # Clean remaining HTML tags in attrs
    def clean_html(self, *attrs: str) -> None:
        attrs += "titre", "texte", "descriptif", "extra"
        for attr in attrs:
            a = getattr(self, attr)
            if len(a) > 0:
                setattr(self, attr, HTMLTAG.sub("", a))
    # Write object to output destination
-    def write(self, parent_dir: str, clean_html: bool = True) -> str:
+    def write(self, parent_dir: str) -> str:
        # Link articles
        self.link_articles()
-        # Delete remaining HTML tags WARNING
+        # Convert body after linking articles
-        if clean_html:
+        self.texte = self.convert(self.texte)
            self.clean_html()
        # Define actual export directory
        directory: str = parent_dir + self.dir_slug()
        # Make a directory for this object if there isn’t