diff --git a/spip2md/converter.py b/spip2md/converter.py index 9da8723..0eaeddb 100644 --- a/spip2md/converter.py +++ b/spip2md/converter.py @@ -253,32 +253,30 @@ isoToUtf = ( ## WARNING unknown broken encoding unknownIso = ( - compile(r"\w*
.*\r?\n"), # unknown 
 + surroundings - compile(r"\w*∆.*\r?\n"), # unknown â^† + surroundings + ( # unknown 
 + surroundings + compile(r"
"), + compile(r"
.*(?=\r?\n|$)"), + ), + ( # unknown â^† + surroundings + compile(r"∆"), + compile(r"∆.*(?=\r?\n|$)"), + ), ) def convertBody(spipBody): - text = spipBody - errors = [] + text: str = spipBody for spip, markdown in spipToMarkdown: text = spip.sub(markdown, text) for iso, utf in isoToUtf: text = iso.sub(utf, text) - for iso in unknownIso: - for match in iso.finditer(text): - errors.append(match.group()) - return text, errors + return text def convertMeta(spipMeta): - text = spipMeta - errors = [] + text: str = spipMeta for spip, metadata in spipToText: text = spip.sub(metadata, text) for iso, utf in isoToUtf: text = iso.sub(utf, text) - for iso in unknownIso: - for match in iso.finditer(text): - errors.append(match.group()) - return text, errors + return text diff --git a/spip2md/iterator.py b/spip2md/iterator.py index 3ac9990..e23a4e7 100644 --- a/spip2md/iterator.py +++ b/spip2md/iterator.py @@ -1,6 +1,6 @@ -from array import array +from re import escape -from converter import convertBody, convertMeta +from converter import convertBody, convertMeta, unknownIso from database import * from slugify import slugify # from yaml import CDumper as Dumper @@ -11,12 +11,12 @@ class Article: def __init__(self, article): self.id = article.id_article # self.surtitle = article.surtitre # Probably unused - self.title, self.title_unknown = convertMeta(article.titre) + self.title = convertMeta(article.titre) self.subtitle = article.soustitre # Probably unused # self.section = article.id_rubrique # TODO join - self.description, self.description_unknown = convertMeta(article.descriptif) + self.description = convertMeta(article.descriptif) self.caption = article.chapo # Probably unused - self.text, self.text_unknown = convertBody(article.texte) # Markdown + self.text = convertBody(article.texte) # Markdown self.ps = article.ps # Probably unused self.publicationDate = article.date self.draft = False if article.statut == "publie" else True @@ -38,16 +38,16 @@ class Article: self.virtual = article.virtuel # TODO Why ? self.microblog = article.microblog # Probably unused - def get_slug(self): + def getSlug(self): return slugify(f"{self.id}-{self.title}") - def get_path(self): - return self.get_slug() + def getPath(self): + return self.getSlug() - def get_authors(self): + def getAuthors(self): return SpipAuteursLiens.select().where(SpipAuteursLiens.id_objet == self.id) - def get_frontmatter(self): + def getFrontmatter(self): return dump( { "lang": self.lang, @@ -58,14 +58,14 @@ class Article: "lastmod": self.update, "draft": self.draft, "description": self.description, - "authors": [author.id_auteur for author in self.get_authors()], + "authors": [author.id_auteur for author in self.getAuthors()], }, allow_unicode=True, ) - def get_article(self): + def getArticle(self): # Build the final article text - article: str = "---\n" + self.get_frontmatter() + "---" + article: str = "---\n" + self.getFrontmatter() + "---" # If there is a caption, add the caption followed by a hr if len(self.caption) > 0: article += "\n\n" + self.caption + "\n\n***" @@ -83,10 +83,34 @@ class Article: article += "\n\n# MICROBLOGGING\n\n" + self.microblog return article + def getUnknownChars(self): + errors: list = [] + for text in (self.title, self.text): + for _, surrounding in unknownIso: + for match in surrounding.finditer(text): + errors.append(match.group()) + return errors + + +def highlightUnknownChars(text): + # Define terminal escape sequences to stylize output, regex escaped + COLOR = "\033[91m" + "\033[1m" # Red + Bold + RESET = "\033[0m" + # Highlight in COLOR unknown chars in text + for char, _ in unknownIso: + for match in char.finditer(text): + text = ( + text[: match.start()] + + COLOR + + match.group() + + RESET + + text[match.end() :] + ) + return text + class Articles: exported: int = 0 - unknownChars: list = [] def __init__(self, maxToExport) -> None: # Query the DB to retrieve all articles sorted by publication date @@ -103,9 +127,10 @@ class Articles: def __next__(self): if self.remaining() <= 0: - raise StopIteration() + raise StopIteration self.exported += 1 + article = Article(self.articles[self.exported - 1]) return ( {"exported": self.exported, "remaining": self.remaining()}, - Article(self.articles[self.exported - 1]), + article, ) diff --git a/spip2md/main.py b/spip2md/main.py index eade914..2c31beb 100755 --- a/spip2md/main.py +++ b/spip2md/main.py @@ -1,7 +1,7 @@ #!python from config import config from database import db -from iterator import Articles +from iterator import Articles, highlightUnknownChars if __name__ != "__main__": exit() @@ -25,12 +25,13 @@ else: maxToExport = config.defaultNbToExport # Define terminal escape sequences to stylize output -R = "\033[91m" -G = "\033[92m" -B = "\033[94m" -BOLD = "\033[1m" -UNDERLINE = "\033[4m" -RESET = "\033[0m" +R: str = "\033[91m" +G: str = "\033[92m" +B: str = "\033[94m" +BOLD: str = "\033[1m" +RESET: str = "\033[0m" + +unknownChars: dict = {} # Loop among first maxToExport articles & export them for counter, article in Articles(maxToExport): @@ -39,12 +40,27 @@ for counter, article in Articles(maxToExport): f"\n{BOLD}Exporting {R}{counter['remaining']+1}{RESET}" + f"{BOLD} SPIP articles to Markdown & YAML files{RESET}\n" ) - print(f"{BOLD}{counter['exported']}.{RESET} {article.title}") - fullPath = config.outputDir + "/" + article.get_path() - print(f"\t-> {fullPath}/index.md") + print( + f"{BOLD}{counter['exported']}.{RESET} " + highlightUnknownChars(article.title) + ) + fullPath = config.outputDir + "/" + article.getPath() + print(f"{BOLD}>{RESET} {fullPath}/index.md") mkdir(fullPath) with open(fullPath + "/index.md", "w") as f: - f.write(article.get_article()) + f.write(article.getArticle()) + # Store detected unknown characters + if len(article.getUnknownChars()) > 0: + unknownChars[article.title] = article.getUnknownChars() + +for title in unknownChars: + nb = len(unknownChars[title]) + print( + f"\n{BOLD}{nb} " + + f"unknown character{'s' if nb > 1 else ''} detected in{RESET} " + + highlightUnknownChars(title) + ) + for text in unknownChars[title]: + print(f" {BOLD}…{RESET} " + highlightUnknownChars(text)) # Close the database connection db.close()