fix conversion of links + change unknown chars into a configurable string

This commit is contained in:
Guilhem Fauré 2023-05-30 17:48:54 +02:00
parent dd370eb175
commit fbad1f9563
4 changed files with 126 additions and 91 deletions

View File

@ -78,4 +78,5 @@ stored into {esc(BOLD)}{branches}{esc()} directories"""
) )
# Warn about issued warnings in log file # Warn about issued warnings in log file
print(f"\nThere might be warnings and infos in {esc(BOLD)}{CFG.logfile}{esc()}") if isfile(CFG.logfile):
print(f"\nTake a look at warnings and infos in {esc(BOLD)}{CFG.logfile}{esc()}")

View File

@ -27,6 +27,8 @@ class Configuration:
export_filetype: str = "md" export_filetype: str = "md"
logfile: str = "spip2md.log" logfile: str = "spip2md.log"
loglevel: str = "WARNING" loglevel: str = "WARNING"
unknown_char_replacement: str = "??"
alternative_languages = ("fr", "en", "es")
# max_articles_export: int = 1000 # TODO reimplement with recursion # max_articles_export: int = 1000 # TODO reimplement with recursion
# max_sections_export: int = 500 # TODO reimplement with recursion # max_sections_export: int = 500 # TODO reimplement with recursion

View File

@ -108,32 +108,44 @@ SPIP_MARKDOWN = (
) )
DOCUMENT_LINK = ( DOCUMENT_LINK = (
( # SPIP style documents & embeds links ( # SPIP style embeds
compile(r"<()(?:doc|document|emb|embed)([0-9]+)(?:\|(.*?))?>", S | I), compile(r"<()(?:doc|document|emb|embed)([0-9]+)(?:\|(.*?))?>", S | I),
r"[{}]({})", r"[{}]({})",
), ),
( # SPIP style documents & embeds links
compile(r"\[ *([^\]]*?) *-> *(?:doc|document|emb|embed)([0-9]+) *\]", S | I),
r"[{}]({})",
),
( # Markdown style documents & embeds links ( # Markdown style documents & embeds links
compile(r"\[(.*?)\]\((?:doc|document|emb|embed)([0-9]+)(?:\|(.*?))?\)", S | I), compile(r"\[(.*?)\]\((?:doc|document|emb|embed)([0-9]+)(?:\|(.*?))?\)", S | I),
r"[\1{}]({})", r"[{}]({})",
), ),
( # SPIP style images links ( # SPIP style images embeds
compile(r"<()(?:img|image)([0-9]+)(?:\|(.*?))?>", S | I), compile(r"<()(?:img|image)([0-9]+)(?:\|(.*?))?>", S | I),
r"![{}]({})", r"![{}]({})",
), ),
( # SPIP style image links
compile(r"\[ *([^\]]*?) *-> *(?:img|image)([0-9]+) *\]", S | I),
r"[{}]({})",
),
( # Markdown style images links ( # Markdown style images links
compile(r"\[(.*?)\]\((?:img|image)([0-9]+)(?:\|(.*?))?\)", S | I), compile(r"\[(.*?)\]\((?:img|image)([0-9]+)(?:\|(.*?))?\)", S | I),
r"![\1{}]({})", r"![{}]({})",
), ),
) # Name and path can be further replaced with .format() ) # Name and path can be further replaced with .format()
ARTICLE_LINK = ( ARTICLE_LINK = (
( # SPIP style documents & embeds links ( # SPIP style article embeds
compile(r"<()(?:art|article)([0-9]+)(?:\|(.*?))?>", S | I), compile(r"<()(?:art|article)([0-9]+)(?:\|(.*?))?>", S | I),
r"[{}]({})", r"[{}]({})",
), ),
( # SPIP style article links
compile(r"\[ *([^\]]*?) *-> *(?:art|article)([0-9]+) *\]", S | I),
r"[{}]({})",
),
( # Markdown style internal links ( # Markdown style internal links
compile(r"\[(.*?)\]\((?:art|article)([0-9]+)(?:\|(.*?))?\)", S | I), compile(r"\[(.*?)\]\((?:art|article)([0-9]+)(?:\|(.*?))?\)", S | I),
r"[\1{}]({})", r"[{}]({})",
), ),
) # Name and path can be further replaced with .format() ) # Name and path can be further replaced with .format()
@ -144,7 +156,7 @@ SECTION_LINK = (
), ),
( # Markdown style internal links ( # Markdown style internal links
compile(r"\[(.*?)\]\((?:rub|rubrique)([0-9]+)(?:\|(.*?))?\)", S | I), compile(r"\[(.*?)\]\((?:rub|rubrique)([0-9]+)(?:\|(.*?))?\)", S | I),
r"[\1{}]({})", r"[{}]({})",
), ),
) # Name and path can be further replaced with .format() ) # Name and path can be further replaced with .format()
@ -303,5 +315,6 @@ SPECIAL_OUTPUT = (
WARNING_OUTPUT = ( WARNING_OUTPUT = (
compile(r"(ERROR)"), # ERROR compile(r"(ERROR)"), # ERROR
compile(r"(MISSING NAME)"), # MISSING NAME compile(r"(MISSING NAME)"), # MISSING NAME
compile(r"(EMPTY NAME)"), # EMPTY NAME
compile(r"(NOT FOUND)"), # NOT FOUND compile(r"(NOT FOUND)"), # NOT FOUND
) )

View File

@ -68,7 +68,11 @@ class SpipWritable:
return MULTILANG_BLOCK.sub(replace_lang, text) return MULTILANG_BLOCK.sub(replace_lang, text)
# Apply different mappings to a text field, like SPIP to Markdown or encoding # Apply different mappings to a text field, like SPIP to Markdown or encoding
def convert(self, text: Optional[str], clean_html: bool = True) -> str: def convert(self, text: str, clean_html: bool = True) -> str:
if len(text) == 0:
# print("Empty text")
return ""
# Return unknown char surrounded by context_length chars # Return unknown char surrounded by context_length chars
def unknown_chars_context(text: str, char: str, context_len: int = 24) -> str: def unknown_chars_context(text: str, char: str, context_len: int = 24) -> str:
context: str = r".{0," + str(context_len) + r"}" context: str = r".{0," + str(context_len) + r"}"
@ -81,42 +85,44 @@ class SpipWritable:
else: else:
return char return char
if text is not None and len(text) > 0: # Convert SPIP syntax to Markdown
print(f"Converting {text[:40]} from {self.titre}") for spip, markdown in SPIP_MARKDOWN:
# Convert SPIP syntax to Markdown text = spip.sub(markdown, text)
for spip, markdown in SPIP_MARKDOWN: # Remove useless text
text = spip.sub(markdown, text) for bloat in BLOAT:
# Remove useless text text = bloat.sub("", text)
for bloat in BLOAT: # Convert broken ISO encoding to UTF
text = bloat.sub("", text) for iso, utf in ISO_UTF:
# Convert broken ISO encoding to UTF text = text.replace(iso, utf)
for iso, utf in ISO_UTF: # Handle <multi> multi language blocks
text = text.replace(iso, utf) text = self.translate(text)
# Handle <multi> multi language blocks # Delete remaining HTML tags in body WARNING
text = self.translate(text) if clean_html:
# Delete remaining HTML tags in body WARNING text = HTMLTAG.sub("", text)
if clean_html: # Warn about unknown chars
text = HTMLTAG.sub("", text) for char in UNKNOWN_ISO:
# Warn about unknown chars lastend: int = 0
for char in UNKNOWN_ISO: for match in finditer("(" + char + ")+", text):
lastend: int = 0 context: str = unknown_chars_context(text[lastend:], char)
for match in finditer("(" + char + ")+", text): logging.warn(
context: str = unknown_chars_context(text[lastend:], char) f"Unknown char {char} found in {self.titre[:40]} at: {context}"
)
if CFG.unknown_char_replacement is not None:
logging.warn( logging.warn(
f"Unknown char {char} found in {self.titre[:40]} at: {context}" f"Replacing {match.group()} with {CFG.unknown_char_replacement}"
) )
lastend = match.end() text = text.replace(match.group(), CFG.unknown_char_replacement, 1)
else: lastend = match.end()
print("Empty or null text")
return ""
return text return text
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
print(f"Convert titre from {self.titre}") if self.titre is not None:
self.titre: str = self.convert(self.titre) # print(f"Convert titre from {type(self)} {self.titre}")
print(f"Convert descriptif from {self.titre}") self.titre: str = self.convert(self.titre)
self.descriptif: str = self.convert(self.descriptif) if self.descriptif is not None:
# print(f"Convert descriptif from {type(self)} {self.titre}")
self.descriptif: str = self.convert(self.descriptif)
def filename(self, date: bool = False) -> str: def filename(self, date: bool = False) -> str:
raise NotImplementedError( raise NotImplementedError(
@ -148,10 +154,12 @@ class SpipWritable:
self.style_print(output[-1]) self.style_print(output[-1])
# Output the counter & title of the object being exported # Output the counter & title of the object being exported
output.append(f"{index + 1}. ") output.append(f"{index + 1}. ")
if len(self.titre) > 0: if self.titre is None:
output[-1] += self.titre.strip(" ")
else:
output[-1] += "MISSING NAME" output[-1] += "MISSING NAME"
elif len(self.titre) == 0:
output[-1] += "EMPTY NAME"
else:
output[-1] += self.titre.strip(" ")
# Print the output as the program goes # Print the output as the program goes
self.style_print(output[-1], end="") self.style_print(output[-1], end="")
return output return output
@ -213,59 +221,66 @@ class SpipObject(SpipWritable):
descriptif: str descriptif: str
extra: str extra: str
def convert(self, text: Optional[str], clean_html: bool = True) -> str: def convert(self, text: str, clean_html: bool = True) -> str:
if len(text) == 0:
# print("Empty text")
return ""
def found_replace(path_link: str, doc: Any, text: str, match: Match) -> str: def found_replace(path_link: str, doc: Any, text: str, match: Match) -> str:
repl: str = path_link.format(doc.titre, doc.filename()) # TODO get relative path
print(f"Translating link to {repl}") if len(match.group(1)) > 0:
repl: str = path_link.format(match.group(1), doc.filename())
else:
repl: str = path_link.format(doc.titre, doc.filename())
logging.warn(f"Translating link to {repl}")
return text.replace(match.group(), repl) return text.replace(match.group(), repl)
def not_found_warn(path_link: str, text: str, match: Match) -> str: def not_found_warn(path_link: str, text: str, match: Match) -> str:
logging.warn(f"No object for link {match.group()} in {self.titre}") logging.warn(f"No object for link {match.group()} in {self.titre}")
return text.replace(match.group(), path_link.format("", "NOT FOUND")) return text.replace(match.group(), path_link.format("", "NOT FOUND"), 1)
if text is not None and len(text) > 0: for id_link, path_link in DOCUMENT_LINK:
for id_link, path_link in DOCUMENT_LINK: # print(f"Looking for links like {id_link}")
print(f"Looking for links like {id_link}") for match in id_link.finditer(text):
for match in id_link.finditer(text): logging.warning(f"Found document link {match.group()} in {self.titre}")
logging.info(f"Found document link {match.group()} in {self.titre}") try:
try: doc: Document = Document.get(Document.id_document == match.group(2))
doc: Document = Document.get( text = found_replace(path_link, doc, text, match)
Document.id_document == match.group(2) except DoesNotExist:
) text = not_found_warn(path_link, text, match)
text = found_replace(path_link, doc, text, match) for id_link, path_link in ARTICLE_LINK:
except DoesNotExist: # print(f"Looking for links like {id_link}")
text = not_found_warn(path_link, text, match) for match in id_link.finditer(text):
for id_link, path_link in ARTICLE_LINK: logging.info(f"Found article link {match.group()} in {self.titre}")
print(f"Looking for links like {id_link}") try:
for match in id_link.finditer(text): art: Article = Article.get(Article.id_article == match.group(2))
logging.info(f"Found article link {match.group()} in {self.titre}") text = found_replace(path_link, art, text, match)
try: except DoesNotExist:
art: Article = Article.get(Article.id_article == match.group(2)) text = not_found_warn(path_link, text, match)
text = found_replace(path_link, art, text, match) for id_link, path_link in SECTION_LINK:
except DoesNotExist: # print(f"Looking for links like {id_link}")
text = not_found_warn(path_link, text, match) for match in id_link.finditer(text):
for id_link, path_link in SECTION_LINK: logging.info(f"Found section link {match.group()} in {self.titre}")
print(f"Looking for links like {id_link}") try:
for match in id_link.finditer(text): section: Rubrique = Rubrique.get(
logging.info(f"Found section link {match.group()} in {self.titre}") Rubrique.id_rubrique == match.group(2)
try: )
section: Rubrique = Rubrique.get( text = found_replace(path_link, section, text, match)
Rubrique.id_rubrique == match.group(2) except DoesNotExist:
) text = not_found_warn(path_link, text, match)
text = found_replace(path_link, section, text, match)
except DoesNotExist:
text = not_found_warn(path_link, text, match)
else:
return ""
return super().convert(text, clean_html) return super().convert(text, clean_html)
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
# Common fields that need conversions # Common fields that need conversions
print(f"Convert texte from {self.titre}") if self.texte is not None:
self.texte: str = self.convert(self.texte) # print(f"Convert texte from {type(self)} {self.titre}")
print(f"Convert extra from {self.titre}") # print(f"First 500 chars: {self.texte[:500]}")
self.extra: str = self.convert(self.extra) self.texte: str = self.convert(self.texte)
if self.extra is not None:
# print(f"Convert extra from {type(self)} {self.titre}")
# print(f"First 500 chars: {self.extra[:500]}")
self.extra: str = self.convert(self.extra)
self.statut: str = "false" if self.statut == "publie" else "true" self.statut: str = "false" if self.statut == "publie" else "true"
self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true" self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
# Define file prefix (needs to be redefined for sections) # Define file prefix (needs to be redefined for sections)
@ -326,14 +341,14 @@ class SpipObject(SpipWritable):
# Start the content with frontmatter # Start the content with frontmatter
body: str = "---\n" + self.frontmatter() + "---" body: str = "---\n" + self.frontmatter() + "---"
# Add the title as a Markdown h1 # Add the title as a Markdown h1
if len(self.titre) > 0 and CFG.prepend_h1: if self.titre is not None and len(self.titre) > 0 and CFG.prepend_h1:
body += "\n\n# " + self.titre body += "\n\n# " + self.titre
# If there is a text, add the text preceded by two line breaks # If there is a text, add the text preceded by two line breaks
if len(self.texte) > 0: if self.texte is not None and len(self.texte) > 0:
# Remove remaining HTML after & append to body # Remove remaining HTML after & append to body
body += "\n\n" + self.texte body += "\n\n" + self.texte
# Same with an "extra" section # Same with an "extra" section
if len(self.extra) > 0: if self.extra is not None and len(self.extra) > 0:
body += "\n\n# EXTRA\n\n" + self.extra body += "\n\n# EXTRA\n\n" + self.extra
return body return body
@ -361,10 +376,14 @@ class Article(SpipObject, SpipArticles):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
# More conversions needed for articles # More conversions needed for articles
self.surtitre: str = self.convert(self.surtitre) if self.surtitre is not None:
self.soustitre: str = self.convert(self.soustitre) self.surtitre: str = self.convert(self.surtitre)
self.chapo: str = self.convert(self.chapo) if self.soustitre is not None:
self.ps: str = self.convert(self.ps) self.soustitre: str = self.convert(self.soustitre)
if self.chapo is not None:
self.chapo: str = self.convert(self.chapo)
if self.ps is not None:
self.ps: str = self.convert(self.ps)
self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false" self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"
# ID # ID
self.object_id = self.id_article self.object_id = self.id_article