Fix conversion of links + replace unknown chars with a configurable string
This commit is contained in:
parent
dd370eb175
commit
fbad1f9563
@ -78,4 +78,5 @@ stored into {esc(BOLD)}{branches}{esc()} directories"""
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Warn about issued warnings in log file
|
# Warn about issued warnings in log file
|
||||||
print(f"\nThere might be warnings and infos in {esc(BOLD)}{CFG.logfile}{esc()}")
|
if isfile(CFG.logfile):
|
||||||
|
print(f"\nTake a look at warnings and infos in {esc(BOLD)}{CFG.logfile}{esc()}")
|
||||||
|
@ -27,6 +27,8 @@ class Configuration:
|
|||||||
export_filetype: str = "md"
|
export_filetype: str = "md"
|
||||||
logfile: str = "spip2md.log"
|
logfile: str = "spip2md.log"
|
||||||
loglevel: str = "WARNING"
|
loglevel: str = "WARNING"
|
||||||
|
unknown_char_replacement: str = "??"
|
||||||
|
alternative_languages = ("fr", "en", "es")
|
||||||
# max_articles_export: int = 1000 # TODO reimplement with recursion
|
# max_articles_export: int = 1000 # TODO reimplement with recursion
|
||||||
# max_sections_export: int = 500 # TODO reimplement with recursion
|
# max_sections_export: int = 500 # TODO reimplement with recursion
|
||||||
|
|
||||||
|
@ -108,32 +108,44 @@ SPIP_MARKDOWN = (
|
|||||||
)
|
)
|
||||||
|
|
||||||
DOCUMENT_LINK = (
|
DOCUMENT_LINK = (
|
||||||
( # SPIP style documents & embeds links
|
( # SPIP style embeds
|
||||||
compile(r"<()(?:doc|document|emb|embed)([0-9]+)(?:\|(.*?))?>", S | I),
|
compile(r"<()(?:doc|document|emb|embed)([0-9]+)(?:\|(.*?))?>", S | I),
|
||||||
r"[{}]({})",
|
r"[{}]({})",
|
||||||
),
|
),
|
||||||
|
( # SPIP style documents & embeds links
|
||||||
|
compile(r"\[ *([^\]]*?) *-> *(?:doc|document|emb|embed)([0-9]+) *\]", S | I),
|
||||||
|
r"[{}]({})",
|
||||||
|
),
|
||||||
( # Markdown style documents & embeds links
|
( # Markdown style documents & embeds links
|
||||||
compile(r"\[(.*?)\]\((?:doc|document|emb|embed)([0-9]+)(?:\|(.*?))?\)", S | I),
|
compile(r"\[(.*?)\]\((?:doc|document|emb|embed)([0-9]+)(?:\|(.*?))?\)", S | I),
|
||||||
r"[\1{}]({})",
|
r"[{}]({})",
|
||||||
),
|
),
|
||||||
( # SPIP style images links
|
( # SPIP style images embeds
|
||||||
compile(r"<()(?:img|image)([0-9]+)(?:\|(.*?))?>", S | I),
|
compile(r"<()(?:img|image)([0-9]+)(?:\|(.*?))?>", S | I),
|
||||||
r"![{}]({})",
|
r"![{}]({})",
|
||||||
),
|
),
|
||||||
|
( # SPIP style image links
|
||||||
|
compile(r"\[ *([^\]]*?) *-> *(?:img|image)([0-9]+) *\]", S | I),
|
||||||
|
r"[{}]({})",
|
||||||
|
),
|
||||||
( # Markdown style images links
|
( # Markdown style images links
|
||||||
compile(r"\[(.*?)\]\((?:img|image)([0-9]+)(?:\|(.*?))?\)", S | I),
|
compile(r"\[(.*?)\]\((?:img|image)([0-9]+)(?:\|(.*?))?\)", S | I),
|
||||||
r"![\1{}]({})",
|
r"![{}]({})",
|
||||||
),
|
),
|
||||||
) # Name and path can be further replaced with .format()
|
) # Name and path can be further replaced with .format()
|
||||||
|
|
||||||
ARTICLE_LINK = (
|
ARTICLE_LINK = (
|
||||||
( # SPIP style documents & embeds links
|
( # SPIP style article embeds
|
||||||
compile(r"<()(?:art|article)([0-9]+)(?:\|(.*?))?>", S | I),
|
compile(r"<()(?:art|article)([0-9]+)(?:\|(.*?))?>", S | I),
|
||||||
r"[{}]({})",
|
r"[{}]({})",
|
||||||
),
|
),
|
||||||
|
( # SPIP style article links
|
||||||
|
compile(r"\[ *([^\]]*?) *-> *(?:art|article)([0-9]+) *\]", S | I),
|
||||||
|
r"[{}]({})",
|
||||||
|
),
|
||||||
( # Markdown style internal links
|
( # Markdown style internal links
|
||||||
compile(r"\[(.*?)\]\((?:art|article)([0-9]+)(?:\|(.*?))?\)", S | I),
|
compile(r"\[(.*?)\]\((?:art|article)([0-9]+)(?:\|(.*?))?\)", S | I),
|
||||||
r"[\1{}]({})",
|
r"[{}]({})",
|
||||||
),
|
),
|
||||||
) # Name and path can be further replaced with .format()
|
) # Name and path can be further replaced with .format()
|
||||||
|
|
||||||
@ -144,7 +156,7 @@ SECTION_LINK = (
|
|||||||
),
|
),
|
||||||
( # Markdown style internal links
|
( # Markdown style internal links
|
||||||
compile(r"\[(.*?)\]\((?:rub|rubrique)([0-9]+)(?:\|(.*?))?\)", S | I),
|
compile(r"\[(.*?)\]\((?:rub|rubrique)([0-9]+)(?:\|(.*?))?\)", S | I),
|
||||||
r"[\1{}]({})",
|
r"[{}]({})",
|
||||||
),
|
),
|
||||||
) # Name and path can be further replaced with .format()
|
) # Name and path can be further replaced with .format()
|
||||||
|
|
||||||
@ -303,5 +315,6 @@ SPECIAL_OUTPUT = (
|
|||||||
WARNING_OUTPUT = (
|
WARNING_OUTPUT = (
|
||||||
compile(r"(ERROR)"), # ERROR
|
compile(r"(ERROR)"), # ERROR
|
||||||
compile(r"(MISSING NAME)"), # MISSING NAME
|
compile(r"(MISSING NAME)"), # MISSING NAME
|
||||||
|
compile(r"(EMPTY NAME)"), # EMPTY NAME
|
||||||
compile(r"(NOT FOUND)"), # NOT FOUND
|
compile(r"(NOT FOUND)"), # NOT FOUND
|
||||||
)
|
)
|
||||||
|
@ -68,7 +68,11 @@ class SpipWritable:
|
|||||||
return MULTILANG_BLOCK.sub(replace_lang, text)
|
return MULTILANG_BLOCK.sub(replace_lang, text)
|
||||||
|
|
||||||
# Apply different mappings to a text field, like SPIP to Markdown or encoding
|
# Apply different mappings to a text field, like SPIP to Markdown or encoding
|
||||||
def convert(self, text: Optional[str], clean_html: bool = True) -> str:
|
def convert(self, text: str, clean_html: bool = True) -> str:
|
||||||
|
if len(text) == 0:
|
||||||
|
# print("Empty text")
|
||||||
|
return ""
|
||||||
|
|
||||||
# Return unknown char surrounded by context_length chars
|
# Return unknown char surrounded by context_length chars
|
||||||
def unknown_chars_context(text: str, char: str, context_len: int = 24) -> str:
|
def unknown_chars_context(text: str, char: str, context_len: int = 24) -> str:
|
||||||
context: str = r".{0," + str(context_len) + r"}"
|
context: str = r".{0," + str(context_len) + r"}"
|
||||||
@ -81,42 +85,44 @@ class SpipWritable:
|
|||||||
else:
|
else:
|
||||||
return char
|
return char
|
||||||
|
|
||||||
if text is not None and len(text) > 0:
|
# Convert SPIP syntax to Markdown
|
||||||
print(f"Converting {text[:40]} from {self.titre}")
|
for spip, markdown in SPIP_MARKDOWN:
|
||||||
# Convert SPIP syntax to Markdown
|
text = spip.sub(markdown, text)
|
||||||
for spip, markdown in SPIP_MARKDOWN:
|
# Remove useless text
|
||||||
text = spip.sub(markdown, text)
|
for bloat in BLOAT:
|
||||||
# Remove useless text
|
text = bloat.sub("", text)
|
||||||
for bloat in BLOAT:
|
# Convert broken ISO encoding to UTF
|
||||||
text = bloat.sub("", text)
|
for iso, utf in ISO_UTF:
|
||||||
# Convert broken ISO encoding to UTF
|
text = text.replace(iso, utf)
|
||||||
for iso, utf in ISO_UTF:
|
# Handle <multi> multi language blocks
|
||||||
text = text.replace(iso, utf)
|
text = self.translate(text)
|
||||||
# Handle <multi> multi language blocks
|
# Delete remaining HTML tags in body WARNING
|
||||||
text = self.translate(text)
|
if clean_html:
|
||||||
# Delete remaining HTML tags in body WARNING
|
text = HTMLTAG.sub("", text)
|
||||||
if clean_html:
|
# Warn about unknown chars
|
||||||
text = HTMLTAG.sub("", text)
|
for char in UNKNOWN_ISO:
|
||||||
# Warn about unknown chars
|
lastend: int = 0
|
||||||
for char in UNKNOWN_ISO:
|
for match in finditer("(" + char + ")+", text):
|
||||||
lastend: int = 0
|
context: str = unknown_chars_context(text[lastend:], char)
|
||||||
for match in finditer("(" + char + ")+", text):
|
logging.warn(
|
||||||
context: str = unknown_chars_context(text[lastend:], char)
|
f"Unknown char {char} found in {self.titre[:40]} at: {context}"
|
||||||
|
)
|
||||||
|
if CFG.unknown_char_replacement is not None:
|
||||||
logging.warn(
|
logging.warn(
|
||||||
f"Unknown char {char} found in {self.titre[:40]} at: {context}"
|
f"Replacing {match.group()} with {CFG.unknown_char_replacement}"
|
||||||
)
|
)
|
||||||
lastend = match.end()
|
text = text.replace(match.group(), CFG.unknown_char_replacement, 1)
|
||||||
else:
|
lastend = match.end()
|
||||||
print("Empty or null text")
|
|
||||||
return ""
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
print(f"Convert titre from {self.titre}")
|
if self.titre is not None:
|
||||||
self.titre: str = self.convert(self.titre)
|
# print(f"Convert titre from {type(self)} {self.titre}")
|
||||||
print(f"Convert descriptif from {self.titre}")
|
self.titre: str = self.convert(self.titre)
|
||||||
self.descriptif: str = self.convert(self.descriptif)
|
if self.descriptif is not None:
|
||||||
|
# print(f"Convert descriptif from {type(self)} {self.titre}")
|
||||||
|
self.descriptif: str = self.convert(self.descriptif)
|
||||||
|
|
||||||
def filename(self, date: bool = False) -> str:
|
def filename(self, date: bool = False) -> str:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
@ -148,10 +154,12 @@ class SpipWritable:
|
|||||||
self.style_print(output[-1])
|
self.style_print(output[-1])
|
||||||
# Output the counter & title of the object being exported
|
# Output the counter & title of the object being exported
|
||||||
output.append(f"{index + 1}. ")
|
output.append(f"{index + 1}. ")
|
||||||
if len(self.titre) > 0:
|
if self.titre is None:
|
||||||
output[-1] += self.titre.strip(" ")
|
|
||||||
else:
|
|
||||||
output[-1] += "MISSING NAME"
|
output[-1] += "MISSING NAME"
|
||||||
|
elif len(self.titre) == 0:
|
||||||
|
output[-1] += "EMPTY NAME"
|
||||||
|
else:
|
||||||
|
output[-1] += self.titre.strip(" ")
|
||||||
# Print the output as the program goes
|
# Print the output as the program goes
|
||||||
self.style_print(output[-1], end="")
|
self.style_print(output[-1], end="")
|
||||||
return output
|
return output
|
||||||
@ -213,59 +221,66 @@ class SpipObject(SpipWritable):
|
|||||||
descriptif: str
|
descriptif: str
|
||||||
extra: str
|
extra: str
|
||||||
|
|
||||||
def convert(self, text: Optional[str], clean_html: bool = True) -> str:
|
def convert(self, text: str, clean_html: bool = True) -> str:
|
||||||
|
if len(text) == 0:
|
||||||
|
# print("Empty text")
|
||||||
|
return ""
|
||||||
|
|
||||||
def found_replace(path_link: str, doc: Any, text: str, match: Match) -> str:
|
def found_replace(path_link: str, doc: Any, text: str, match: Match) -> str:
|
||||||
repl: str = path_link.format(doc.titre, doc.filename())
|
# TODO get relative path
|
||||||
print(f"Translating link to {repl}")
|
if len(match.group(1)) > 0:
|
||||||
|
repl: str = path_link.format(match.group(1), doc.filename())
|
||||||
|
else:
|
||||||
|
repl: str = path_link.format(doc.titre, doc.filename())
|
||||||
|
logging.warn(f"Translating link to {repl}")
|
||||||
return text.replace(match.group(), repl)
|
return text.replace(match.group(), repl)
|
||||||
|
|
||||||
def not_found_warn(path_link: str, text: str, match: Match) -> str:
|
def not_found_warn(path_link: str, text: str, match: Match) -> str:
|
||||||
logging.warn(f"No object for link {match.group()} in {self.titre}")
|
logging.warn(f"No object for link {match.group()} in {self.titre}")
|
||||||
return text.replace(match.group(), path_link.format("", "NOT FOUND"))
|
return text.replace(match.group(), path_link.format("", "NOT FOUND"), 1)
|
||||||
|
|
||||||
if text is not None and len(text) > 0:
|
for id_link, path_link in DOCUMENT_LINK:
|
||||||
for id_link, path_link in DOCUMENT_LINK:
|
# print(f"Looking for links like {id_link}")
|
||||||
print(f"Looking for links like {id_link}")
|
for match in id_link.finditer(text):
|
||||||
for match in id_link.finditer(text):
|
logging.warning(f"Found document link {match.group()} in {self.titre}")
|
||||||
logging.info(f"Found document link {match.group()} in {self.titre}")
|
try:
|
||||||
try:
|
doc: Document = Document.get(Document.id_document == match.group(2))
|
||||||
doc: Document = Document.get(
|
text = found_replace(path_link, doc, text, match)
|
||||||
Document.id_document == match.group(2)
|
except DoesNotExist:
|
||||||
)
|
text = not_found_warn(path_link, text, match)
|
||||||
text = found_replace(path_link, doc, text, match)
|
for id_link, path_link in ARTICLE_LINK:
|
||||||
except DoesNotExist:
|
# print(f"Looking for links like {id_link}")
|
||||||
text = not_found_warn(path_link, text, match)
|
for match in id_link.finditer(text):
|
||||||
for id_link, path_link in ARTICLE_LINK:
|
logging.info(f"Found article link {match.group()} in {self.titre}")
|
||||||
print(f"Looking for links like {id_link}")
|
try:
|
||||||
for match in id_link.finditer(text):
|
art: Article = Article.get(Article.id_article == match.group(2))
|
||||||
logging.info(f"Found article link {match.group()} in {self.titre}")
|
text = found_replace(path_link, art, text, match)
|
||||||
try:
|
except DoesNotExist:
|
||||||
art: Article = Article.get(Article.id_article == match.group(2))
|
text = not_found_warn(path_link, text, match)
|
||||||
text = found_replace(path_link, art, text, match)
|
for id_link, path_link in SECTION_LINK:
|
||||||
except DoesNotExist:
|
# print(f"Looking for links like {id_link}")
|
||||||
text = not_found_warn(path_link, text, match)
|
for match in id_link.finditer(text):
|
||||||
for id_link, path_link in SECTION_LINK:
|
logging.info(f"Found section link {match.group()} in {self.titre}")
|
||||||
print(f"Looking for links like {id_link}")
|
try:
|
||||||
for match in id_link.finditer(text):
|
section: Rubrique = Rubrique.get(
|
||||||
logging.info(f"Found section link {match.group()} in {self.titre}")
|
Rubrique.id_rubrique == match.group(2)
|
||||||
try:
|
)
|
||||||
section: Rubrique = Rubrique.get(
|
text = found_replace(path_link, section, text, match)
|
||||||
Rubrique.id_rubrique == match.group(2)
|
except DoesNotExist:
|
||||||
)
|
text = not_found_warn(path_link, text, match)
|
||||||
text = found_replace(path_link, section, text, match)
|
|
||||||
except DoesNotExist:
|
|
||||||
text = not_found_warn(path_link, text, match)
|
|
||||||
else:
|
|
||||||
return ""
|
|
||||||
return super().convert(text, clean_html)
|
return super().convert(text, clean_html)
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
# Common fields that need conversions
|
# Common fields that need conversions
|
||||||
print(f"Convert texte from {self.titre}")
|
if self.texte is not None:
|
||||||
self.texte: str = self.convert(self.texte)
|
# print(f"Convert texte from {type(self)} {self.titre}")
|
||||||
print(f"Convert extra from {self.titre}")
|
# print(f"First 500 chars: {self.texte[:500]}")
|
||||||
self.extra: str = self.convert(self.extra)
|
self.texte: str = self.convert(self.texte)
|
||||||
|
if self.extra is not None:
|
||||||
|
# print(f"Convert extra from {type(self)} {self.titre}")
|
||||||
|
# print(f"First 500 chars: {self.extra[:500]}")
|
||||||
|
self.extra: str = self.convert(self.extra)
|
||||||
self.statut: str = "false" if self.statut == "publie" else "true"
|
self.statut: str = "false" if self.statut == "publie" else "true"
|
||||||
self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
|
self.langue_choisie: str = "false" if self.langue_choisie == "oui" else "true"
|
||||||
# Define file prefix (needs to be redefined for sections)
|
# Define file prefix (needs to be redefined for sections)
|
||||||
@ -326,14 +341,14 @@ class SpipObject(SpipWritable):
|
|||||||
# Start the content with frontmatter
|
# Start the content with frontmatter
|
||||||
body: str = "---\n" + self.frontmatter() + "---"
|
body: str = "---\n" + self.frontmatter() + "---"
|
||||||
# Add the title as a Markdown h1
|
# Add the title as a Markdown h1
|
||||||
if len(self.titre) > 0 and CFG.prepend_h1:
|
if self.titre is not None and len(self.titre) > 0 and CFG.prepend_h1:
|
||||||
body += "\n\n# " + self.titre
|
body += "\n\n# " + self.titre
|
||||||
# If there is a text, add the text preceded by two line breaks
|
# If there is a text, add the text preceded by two line breaks
|
||||||
if len(self.texte) > 0:
|
if self.texte is not None and len(self.texte) > 0:
|
||||||
# Remove remaining HTML after & append to body
|
# Remove remaining HTML after & append to body
|
||||||
body += "\n\n" + self.texte
|
body += "\n\n" + self.texte
|
||||||
# Same with an "extra" section
|
# Same with an "extra" section
|
||||||
if len(self.extra) > 0:
|
if self.extra is not None and len(self.extra) > 0:
|
||||||
body += "\n\n# EXTRA\n\n" + self.extra
|
body += "\n\n# EXTRA\n\n" + self.extra
|
||||||
return body
|
return body
|
||||||
|
|
||||||
@ -361,10 +376,14 @@ class Article(SpipObject, SpipArticles):
|
|||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
# More conversions needed for articles
|
# More conversions needed for articles
|
||||||
self.surtitre: str = self.convert(self.surtitre)
|
if self.surtitre is not None:
|
||||||
self.soustitre: str = self.convert(self.soustitre)
|
self.surtitre: str = self.convert(self.surtitre)
|
||||||
self.chapo: str = self.convert(self.chapo)
|
if self.soustitre is not None:
|
||||||
self.ps: str = self.convert(self.ps)
|
self.soustitre: str = self.convert(self.soustitre)
|
||||||
|
if self.chapo is not None:
|
||||||
|
self.chapo: str = self.convert(self.chapo)
|
||||||
|
if self.ps is not None:
|
||||||
|
self.ps: str = self.convert(self.ps)
|
||||||
self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"
|
self.accepter_forum: str = "true" if self.accepter_forum == "oui" else "false"
|
||||||
# ID
|
# ID
|
||||||
self.object_id = self.id_article
|
self.object_id = self.id_article
|
||||||
|
Loading…
Reference in New Issue
Block a user