fix html cleaning & reenabled body converting
This commit is contained in:
parent
d20976c59d
commit
9c79433f74
@ -1,6 +1,6 @@
|
|||||||
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
|
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
|
||||||
import sys
|
import sys
|
||||||
from os import makedirs, remove
|
from os import makedirs
|
||||||
from shutil import rmtree
|
from shutil import rmtree
|
||||||
|
|
||||||
from spip2md.config import CFG
|
from spip2md.config import CFG
|
||||||
@ -48,9 +48,6 @@ def main(*argv):
|
|||||||
if CFG.clear_output:
|
if CFG.clear_output:
|
||||||
rmtree(CFG.output_dir, True)
|
rmtree(CFG.output_dir, True)
|
||||||
makedirs(CFG.output_dir, exist_ok=True)
|
makedirs(CFG.output_dir, exist_ok=True)
|
||||||
# Clear the log file
|
|
||||||
# if CFG.clear_log:
|
|
||||||
# remove(CFG.logfile)
|
|
||||||
|
|
||||||
# Get the virtual id=0 section
|
# Get the virtual id=0 section
|
||||||
root: Rubrique = RootRubrique()
|
root: Rubrique = RootRubrique()
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
|
# SPIP website to plain Markdown files converter, Copyright (C) 2023 Guilhem Fauré
|
||||||
import logging
|
import logging
|
||||||
from os import makedirs
|
from os import makedirs, remove
|
||||||
from os.path import basename, splitext
|
from os.path import basename, splitext
|
||||||
from re import finditer, search, sub
|
from re import finditer, search, sub
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
@ -34,6 +34,9 @@ from spip2md.regexmap import (
|
|||||||
)
|
)
|
||||||
from spip2md.style import BLUE, BOLD, GREEN, WARNING_STYLE, YELLOW, esc
|
from spip2md.style import BLUE, BOLD, GREEN, WARNING_STYLE, YELLOW, esc
|
||||||
|
|
||||||
|
# Clear the previous log file if needed
|
||||||
|
if CFG.clear_log:
|
||||||
|
remove(CFG.logfile)
|
||||||
# Output logs to logfile
|
# Output logs to logfile
|
||||||
logging.basicConfig(filename=CFG.logfile, encoding="utf-8")
|
logging.basicConfig(filename=CFG.logfile, encoding="utf-8")
|
||||||
|
|
||||||
@ -69,8 +72,16 @@ class SpipWritable:
|
|||||||
|
|
||||||
return MULTILANG_BLOCK.sub(replace_lang, text)
|
return MULTILANG_BLOCK.sub(replace_lang, text)
|
||||||
|
|
||||||
|
# Remove remaining HTML tags
|
||||||
|
@staticmethod
|
||||||
|
def clean_html(string: str) -> str:
|
||||||
|
if string is not None and len(string) > 0:
|
||||||
|
return HTMLTAG.sub("", string)
|
||||||
|
else:
|
||||||
|
return ""
|
||||||
|
|
||||||
# Apply different mappings to a text field, like SPIP to Markdown or encoding
|
# Apply different mappings to a text field, like SPIP to Markdown or encoding
|
||||||
def convert(self, text: Optional[str]) -> str:
|
def convert(self, text: Optional[str], clean_html: bool = True) -> str:
|
||||||
# Return unknown char surrounded by context_length chars
|
# Return unknown char surrounded by context_length chars
|
||||||
def unknown_chars_context(text: str, char: str, context_len: int = 24) -> str:
|
def unknown_chars_context(text: str, char: str, context_len: int = 24) -> str:
|
||||||
context: str = r".{0," + str(context_len) + r"}"
|
context: str = r".{0," + str(context_len) + r"}"
|
||||||
@ -84,12 +95,21 @@ class SpipWritable:
|
|||||||
return char
|
return char
|
||||||
|
|
||||||
if text is not None and len(text) > 0:
|
if text is not None and len(text) > 0:
|
||||||
|
# Convert SPIP syntax to Markdown
|
||||||
for spip, markdown in SPIP_MARKDOWN:
|
for spip, markdown in SPIP_MARKDOWN:
|
||||||
text = spip.sub(markdown, text)
|
text = spip.sub(markdown, text)
|
||||||
|
# Remove useless text
|
||||||
for bloat in BLOAT:
|
for bloat in BLOAT:
|
||||||
text = bloat.sub("", text)
|
text = bloat.sub("", text)
|
||||||
|
# Convert broken ISO encoding to UTF
|
||||||
for iso, utf in ISO_UTF:
|
for iso, utf in ISO_UTF:
|
||||||
text = text.replace(iso, utf)
|
text = text.replace(iso, utf)
|
||||||
|
# Handle <multi> multi language blocks
|
||||||
|
text = self.translate(text)
|
||||||
|
# Delete remaining HTML tags in body WARNING
|
||||||
|
if clean_html:
|
||||||
|
text = self.clean_html(text)
|
||||||
|
# Warn about unknown chars
|
||||||
for char in UNKNOWN_ISO:
|
for char in UNKNOWN_ISO:
|
||||||
lastend: int = 0
|
lastend: int = 0
|
||||||
for match in finditer("(" + char + ")+", text):
|
for match in finditer("(" + char + ")+", text):
|
||||||
@ -98,7 +118,6 @@ class SpipWritable:
|
|||||||
f"Unknown char {char} found in {self.titre[:40]} at: {context}"
|
f"Unknown char {char} found in {self.titre[:40]} at: {context}"
|
||||||
)
|
)
|
||||||
lastend = match.end()
|
lastend = match.end()
|
||||||
text = self.translate(text)
|
|
||||||
else:
|
else:
|
||||||
return ""
|
return ""
|
||||||
return text
|
return text
|
||||||
@ -131,7 +150,6 @@ class SpipWritable:
|
|||||||
# Output the remaining number of objects to export every step object
|
# Output the remaining number of objects to export every step object
|
||||||
if index % step == 0:
|
if index % step == 0:
|
||||||
output.append(f"Exporting {limit-index}")
|
output.append(f"Exporting {limit-index}")
|
||||||
if hasattr(self, "profondeur"):
|
|
||||||
output[-1] += f" level {self.profondeur}"
|
output[-1] += f" level {self.profondeur}"
|
||||||
s: str = "s" if limit - index > 1 else ""
|
s: str = "s" if limit - index > 1 else ""
|
||||||
output[-1] += f" {type(self).__name__}{s}"
|
output[-1] += f" {type(self).__name__}{s}"
|
||||||
@ -302,21 +320,12 @@ class SpipObject(SpipWritable):
|
|||||||
body += "\n\n# EXTRA\n\n" + self.extra
|
body += "\n\n# EXTRA\n\n" + self.extra
|
||||||
return body
|
return body
|
||||||
|
|
||||||
# Clean remaining HTML tags in attrs
|
|
||||||
def clean_html(self, *attrs: str) -> None:
|
|
||||||
attrs += "titre", "texte", "descriptif", "extra"
|
|
||||||
for attr in attrs:
|
|
||||||
a = getattr(self, attr)
|
|
||||||
if len(a) > 0:
|
|
||||||
setattr(self, attr, HTMLTAG.sub("", a))
|
|
||||||
|
|
||||||
# Write object to output destination
|
# Write object to output destination
|
||||||
def write(self, parent_dir: str, clean_html: bool = True) -> str:
|
def write(self, parent_dir: str) -> str:
|
||||||
# Link articles
|
# Link articles
|
||||||
self.link_articles()
|
self.link_articles()
|
||||||
# Delete remaining HTML tags WARNING
|
# Convert body after linking articles
|
||||||
if clean_html:
|
self.texte = self.convert(self.texte)
|
||||||
self.clean_html()
|
|
||||||
# Define actual export directory
|
# Define actual export directory
|
||||||
directory: str = parent_dir + self.dir_slug()
|
directory: str = parent_dir + self.dir_slug()
|
||||||
# Make a directory for this object if there isn’t
|
# Make a directory for this object if there isn’t
|
||||||
|
Loading…
Reference in New Issue
Block a user