refactoring
This commit is contained in:
parent
a0fd7f0efa
commit
a83ec1da3c
@ -14,7 +14,7 @@ class Configuration:
|
|||||||
db_user = "spip"
|
db_user = "spip"
|
||||||
db_pass = "password"
|
db_pass = "password"
|
||||||
output_dir = "output"
|
output_dir = "output"
|
||||||
default_export_nb = 1000
|
default_export_max = 1000
|
||||||
|
|
||||||
def __init__(self, config_file: Optional[str] = None) -> None:
|
def __init__(self, config_file: Optional[str] = None) -> None:
|
||||||
if config_file is not None:
|
if config_file is not None:
|
||||||
@ -29,7 +29,7 @@ class Configuration:
|
|||||||
if "output_dir" in config:
|
if "output_dir" in config:
|
||||||
self.output_dir = config["output_dir"]
|
self.output_dir = config["output_dir"]
|
||||||
# The setting was renamed to `default_export_max`: the key that is tested
# must be the key that is read. Testing "default_export_nb" and then reading
# "default_export_max" raised KeyError for existing config files and
# ignored the new key entirely.
if "default_export_max" in config:
    self.default_export_max = config["default_export_max"]
elif "default_export_nb" in config:
    # Legacy key name, still honoured so older config files keep working.
    self.default_export_max = config["default_export_nb"]
|
||||||
|
|
||||||
|
|
||||||
config = Configuration()
|
config = Configuration()
|
||||||
|
@ -288,3 +288,11 @@ def highlight_unknown_chars(text: str, pre: str, post: str) -> str:
|
|||||||
text[: match.start()] + pre + match.group() + post + text[match.end() :]
|
text[: match.start()] + pre + match.group() + post + text[match.end() :]
|
||||||
)
|
)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def get_unknown_chars(text: str) -> list[str]:
    """Collect every passage of *text* that contains an unknown character.

    Each entry of the module-level ``unknown_iso`` collection is searched for
    in *text*. Every hit is returned together with up to 20 characters of
    leading context and the text that follows it up to the next line break
    (or the end of the string).

    Args:
        text: The string to scan.

    Returns:
        The matched passages, grouped in ``unknown_iso`` order; an empty list
        when nothing matches.
    """
    # `.{0,20}` is a bounded repeat. The previous `.{0-20}` is not a
    # quantifier, so the pattern required the literal text "{0-20}" before the
    # character and never matched real content.
    # NOTE(review): entries are spliced into the pattern unescaped, as before;
    # confirm `unknown_iso` never contains regex metacharacters.
    return [
        match.group()
        for char in unknown_iso
        for match in finditer(r".{0,20}" + char + r".*(?=\r?\n|$)", text)
    ]
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
# pyright: basic
|
# pyright: basic
|
||||||
from re import finditer
|
from typing import Any, Optional
|
||||||
|
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
from yaml import dump
|
from yaml import dump
|
||||||
|
|
||||||
from converter import convert_body, convert_meta, unknown_iso
|
from converter import convert_body, convert_meta
|
||||||
from database import SpipArticles, SpipAuteurs, SpipAuteursLiens, SpipRubriques
|
from database import SpipArticles, SpipAuteurs, SpipAuteursLiens, SpipRubriques
|
||||||
|
|
||||||
# from yaml import CDumper as Dumper
|
# from yaml import CDumper as Dumper
|
||||||
@ -35,7 +35,7 @@ class Item:
|
|||||||
def get_filename(self) -> str:
|
def get_filename(self) -> str:
|
||||||
return "index" + "." + self.lang + "." + FILETYPE
|
return "index" + "." + self.lang + "." + FILETYPE
|
||||||
|
|
||||||
def get_frontmatter(self) -> str:
|
def get_frontmatter(self, append: Optional[dict[str, Any]] = None) -> str:
|
||||||
return dump(
|
return dump(
|
||||||
{
|
{
|
||||||
"lang": self.lang,
|
"lang": self.lang,
|
||||||
@ -48,40 +48,29 @@ class Item:
|
|||||||
# Debugging
|
# Debugging
|
||||||
"spip_id": self.id,
|
"spip_id": self.id,
|
||||||
"spip_id_secteur": self.sector_id,
|
"spip_id_secteur": self.sector_id,
|
||||||
},
|
}
|
||||||
|
| append
|
||||||
|
if append is not None
|
||||||
|
else {},
|
||||||
allow_unicode=True,
|
allow_unicode=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_content(self) -> str:
|
def get_body(self) -> str:
|
||||||
# Build the final article text
|
body: str = ""
|
||||||
article: str = "---\n" + self.get_frontmatter() + "---"
|
|
||||||
# If there is a caption, add the caption followed by a hr
|
|
||||||
if hasattr(self, "caption") and len(self.caption) > 0:
|
|
||||||
article += "\n\n" + self.caption + "\n\n***"
|
|
||||||
# Add the title as a Markdown h1
|
# Add the title as a Markdown h1
|
||||||
if len(self.title) > 0:
|
if len(self.title) > 0:
|
||||||
article += "\n\n# " + self.title
|
body += "\n\n# " + self.title
|
||||||
# If there is a text, add the text preceded by two line breaks
|
# If there is a text, add the text preceded by two line breaks
|
||||||
if len(self.text) > 0:
|
if len(self.text) > 0:
|
||||||
article += "\n\n" + self.text
|
body += "\n\n" + self.text
|
||||||
# Same with an "extra" section
|
# Same with an "extra" section
|
||||||
if self.extra is not None and len(self.extra) > 0:
|
if self.extra is not None and len(self.extra) > 0:
|
||||||
article += "\n\n# EXTRA\n\n" + self.extra
|
body += "\n\n# EXTRA\n\n" + self.extra
|
||||||
# PS
|
return body
|
||||||
if hasattr(self, "ps") and len(self.ps) > 0:
|
|
||||||
article += "\n\n# POST-SCRIPTUM\n\n" + self.ps
|
|
||||||
# Microblog
|
|
||||||
if hasattr(self, "microblog") and len(self.microblog) > 0:
|
|
||||||
article += "\n\n# MICROBLOGGING\n\n" + self.microblog
|
|
||||||
return article
|
|
||||||
|
|
||||||
def get_unknown_chars(self) -> list[str]:
|
def get_content(self) -> str:
|
||||||
errors: list[str] = []
|
# Return the final article text
|
||||||
for text in (self.title, self.text):
|
return "---\n" + self.get_frontmatter() + "---" + self.get_body()
|
||||||
for char in unknown_iso:
|
|
||||||
for match in finditer(r".{0-20}" + char + r".*(?=\r?\n|$)", text):
|
|
||||||
errors.append(match.group())
|
|
||||||
return errors
|
|
||||||
|
|
||||||
|
|
||||||
class Article(Item):
|
class Article(Item):
|
||||||
@ -115,28 +104,32 @@ class Article(Item):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def get_frontmatter(self) -> str:
|
def get_frontmatter(self) -> str:
|
||||||
return dump(
|
return super().get_frontmatter(
|
||||||
{
|
{
|
||||||
"lang": self.lang,
|
|
||||||
"translationKey": self.translation_key,
|
|
||||||
"title": self.title,
|
|
||||||
"surtitle": self.surtitle,
|
"surtitle": self.surtitle,
|
||||||
"subtitle": self.subtitle,
|
"subtitle": self.subtitle,
|
||||||
"date": self.creation,
|
"date": self.creation,
|
||||||
"publishDate": self.publication,
|
|
||||||
"lastmod": self.update,
|
|
||||||
"draft": self.draft,
|
|
||||||
"description": self.description,
|
|
||||||
"authors": [author.nom for author in self.get_authors()],
|
"authors": [author.nom for author in self.get_authors()],
|
||||||
# Debugging
|
# Debugging
|
||||||
"spip_id_article": self.id,
|
|
||||||
"spip_id_rubrique": self.section_id,
|
"spip_id_rubrique": self.section_id,
|
||||||
"spip_id_secteur": self.sector_id,
|
"spip_id_secteur": self.sector_id,
|
||||||
"spip_chapo": self.caption,
|
"spip_chapo": self.caption,
|
||||||
},
|
},
|
||||||
allow_unicode=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def get_body(self) -> str:
    """Build the Markdown body of the article.

    The result is, in order: the caption followed by a horizontal rule, the
    body produced by the parent class, then the post-scriptum and
    microblogging sections. Each optional part is emitted only when the
    attribute exists and is non-empty.

    Returns:
        The concatenated Markdown body, every part preceded by a blank line.
    """
    parts: list[str] = []
    # The caption and its rule introduce the article, so they go before the
    # parent body, as they did before the refactoring; appending them after
    # it left the rule dangling between the text and the post-scriptum.
    if hasattr(self, "caption") and len(self.caption) > 0:
        parts.append("\n\n" + self.caption + "\n\n***")
    parts.append(super().get_body())
    # PS
    if hasattr(self, "ps") and len(self.ps) > 0:
        parts.append("\n\n# POST-SCRIPTUM\n\n" + self.ps)
    # Microblog
    if hasattr(self, "microblog") and len(self.microblog) > 0:
        parts.append("\n\n# MICROBLOGGING\n\n" + self.microblog)
    return "".join(parts)
|
||||||
|
|
||||||
|
|
||||||
class Section(Item):
|
class Section(Item):
|
||||||
def __init__(self, section) -> None:
|
def __init__(self, section) -> None:
|
||||||
@ -146,6 +139,9 @@ class Section(Item):
|
|||||||
self.depth: int = section.profondeur
|
self.depth: int = section.profondeur
|
||||||
self.agenda: int = section.agenda
|
self.agenda: int = section.agenda
|
||||||
|
|
||||||
|
def get_filename(self) -> str:
    """Return the parent class's filename with a leading underscore."""
    base_name: str = super().get_filename()
    return "_" + base_name
|
||||||
|
|
||||||
def get_articles(self, limit: int = 0):
|
def get_articles(self, limit: int = 0):
|
||||||
return Articles(self.id, limit)
|
return Articles(self.id, limit)
|
||||||
|
|
||||||
@ -168,7 +164,7 @@ class LimitCounter:
|
|||||||
return self.count
|
return self.count
|
||||||
|
|
||||||
|
|
||||||
class Items:
|
class Iterator:
|
||||||
items: list
|
items: list
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
@ -182,7 +178,7 @@ class Items:
|
|||||||
return self.count.LIMIT
|
return self.count.LIMIT
|
||||||
|
|
||||||
|
|
||||||
class Articles(Items):
|
class Articles(Iterator):
|
||||||
def __init__(self, section_id: int, limit: int = 0) -> None:
|
def __init__(self, section_id: int, limit: int = 0) -> None:
|
||||||
# Query the DB to retrieve all articles sorted by publication date
|
# Query the DB to retrieve all articles sorted by publication date
|
||||||
if limit > 0:
|
if limit > 0:
|
||||||
@ -193,14 +189,18 @@ class Articles(Items):
|
|||||||
.limit(limit)
|
.limit(limit)
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
self.items = SpipArticles.select().order_by(SpipArticles.date.desc())
|
self.items = (
|
||||||
|
SpipArticles.select()
|
||||||
|
.where(SpipArticles.id_rubrique == section_id)
|
||||||
|
.order_by(SpipArticles.date.desc())
|
||||||
|
)
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
def __next__(self):
|
def __next__(self):
|
||||||
return (Article(self.items[self.count.step()]), self.count)
|
return (Article(self.items[self.count.step()]), self.count)
|
||||||
|
|
||||||
|
|
||||||
class Sections(Items):
|
class Sections(Iterator):
|
||||||
def __init__(self, limit: int = 0) -> None:
|
def __init__(self, limit: int = 0) -> None:
|
||||||
# Query the DB to retrieve all sections sorted by publication date
|
# Query the DB to retrieve all sections sorted by publication date
|
||||||
if limit > 0:
|
if limit > 0:
|
@ -5,9 +5,9 @@ from shutil import rmtree
|
|||||||
from sys import argv
|
from sys import argv
|
||||||
|
|
||||||
from config import config
|
from config import config
|
||||||
from converter import highlight_unknown_chars
|
from converter import get_unknown_chars, highlight_unknown_chars
|
||||||
from database import db
|
from database import db
|
||||||
from lib import Article, Sections
|
from items import Article, Sections
|
||||||
|
|
||||||
# Define terminal escape sequences to stylize output
|
# Define terminal escape sequences to stylize output
|
||||||
R: str = "\033[91m"
|
R: str = "\033[91m"
|
||||||
@ -22,10 +22,10 @@ db.connect()
|
|||||||
|
|
||||||
if __name__ == "__main__": # Following is executed only if script is directly executed
|
if __name__ == "__main__": # Following is executed only if script is directly executed
|
||||||
# Define max nb of articles to export based on first CLI argument
|
# Define max nb of articles to export based on first CLI argument
|
||||||
if len(argv) > 1:
|
if len(argv) >= 2:
|
||||||
maxexport = int(argv[1])
|
toexport = int(argv[1])
|
||||||
else:
|
else:
|
||||||
maxexport = config.default_export_nb
|
toexport = config.default_export_max
|
||||||
|
|
||||||
# Clear the output dir & create a new
|
# Clear the output dir & create a new
|
||||||
rmtree(config.output_dir, True)
|
rmtree(config.output_dir, True)
|
||||||
@ -36,6 +36,10 @@ if __name__ == "__main__": # Following is executed only if script is directly e
|
|||||||
|
|
||||||
# Loop among first maxexport articles & export them
|
# Loop among first maxexport articles & export them
|
||||||
for section, counter in Sections():
|
for section, counter in Sections():
|
||||||
|
# Define articles of the sections, limited by toexport
|
||||||
|
if toexport <= 0:
|
||||||
|
break
|
||||||
|
articles = section.get_articles(toexport)
|
||||||
# Print the name of the exported section & number of remaining sections
|
# Print the name of the exported section & number of remaining sections
|
||||||
print(
|
print(
|
||||||
f"{BOLD}{counter.count + 1}. {RESET}"
|
f"{BOLD}{counter.count + 1}. {RESET}"
|
||||||
@ -44,11 +48,16 @@ if __name__ == "__main__": # Following is executed only if script is directly e
|
|||||||
)
|
)
|
||||||
if counter.remaining() > 2:
|
if counter.remaining() > 2:
|
||||||
print(
|
print(
|
||||||
f" {BOLD}{R}{counter.remaining()-1}{RESET} {BOLD}sections left"
|
f" {BOLD}{B}{counter.remaining()-1}{RESET} {BOLD}sections left"
|
||||||
+ RESET,
|
+ RESET,
|
||||||
|
end="",
|
||||||
)
|
)
|
||||||
else:
|
if toexport > 1:
|
||||||
print()
|
print(
|
||||||
|
f" {BOLD}Export limit is in {R}{toexport}{RESET} articles{RESET}",
|
||||||
|
end="",
|
||||||
|
)
|
||||||
|
print()
|
||||||
# Define the section’s path (directory) & create directory(ies) if needed
|
# Define the section’s path (directory) & create directory(ies) if needed
|
||||||
sectiondir: str = config.output_dir + "/" + section.get_slug()
|
sectiondir: str = config.output_dir + "/" + section.get_slug()
|
||||||
makedirs(sectiondir, exist_ok=True)
|
makedirs(sectiondir, exist_ok=True)
|
||||||
@ -57,21 +66,19 @@ if __name__ == "__main__": # Following is executed only if script is directly e
|
|||||||
with open(sectionpath, "w") as f:
|
with open(sectionpath, "w") as f:
|
||||||
f.write(section.get_content())
|
f.write(section.get_content())
|
||||||
# Loop over section’s articles
|
# Loop over section’s articles
|
||||||
articles = section.get_articles(maxexport)
|
|
||||||
maxexport -= len(articles)
|
|
||||||
for article, counter in articles:
|
for article, counter in articles:
|
||||||
# Print the remaining number of articles to export every 100 articles
|
# Print the remaining number of articles to export every 100 articles
|
||||||
if counter.count % 100 == 0:
|
if counter.count % 100 == 0:
|
||||||
s: str = "s" if counter.remaining() > 1 else ""
|
s: str = "s" if counter.remaining() > 1 else ""
|
||||||
print(
|
print(
|
||||||
f" {BOLD}Exporting {R}{counter.remaining()}{RESET}"
|
f" {BOLD}Exporting {G}{counter.remaining()}{RESET}"
|
||||||
+ f"{BOLD} SPIP article{s}{RESET} to Markdown & YAML files"
|
+ f"{BOLD} SPIP article{s}{RESET} to Markdown & YAML files"
|
||||||
)
|
)
|
||||||
# Print the title of the article being exported
|
# Print the title of the article being exported
|
||||||
print(
|
print(
|
||||||
f" {BOLD}{counter.count + 1}. "
|
f" {BOLD}{counter.count + 1}. "
|
||||||
+ ("EMPTY " if len(article.text) < 1 else "")
|
+ ("EMPTY " if len(article.text) < 1 else "")
|
||||||
+ RESET
|
+ f"{article.lang} {RESET}"
|
||||||
+ highlight_unknown_chars(article.title, R, RESET)
|
+ highlight_unknown_chars(article.title, R, RESET)
|
||||||
)
|
)
|
||||||
# Define the full article path & create directory(ies) if needed
|
# Define the full article path & create directory(ies) if needed
|
||||||
@ -81,19 +88,22 @@ if __name__ == "__main__": # Following is executed only if script is directly e
|
|||||||
articlepath: str = articledir + "/" + article.get_filename()
|
articlepath: str = articledir + "/" + article.get_filename()
|
||||||
with open(articlepath, "w") as f:
|
with open(articlepath, "w") as f:
|
||||||
f.write(article.get_content())
|
f.write(article.get_content())
|
||||||
# Store detected unknown characters
|
# Store articles with unknown characters
|
||||||
if len(article.get_unknown_chars()) > 0:
|
print(f"UNKNOWN CHARS {get_unknown_chars(article.text)}")
|
||||||
|
if len(get_unknown_chars(article.text)) > 0:
|
||||||
unknown_chars_articles.append(article)
|
unknown_chars_articles.append(article)
|
||||||
# Print the outputted file’s path when finished exporting the article
|
# Print the outputted file’s path when finished exporting the article
|
||||||
print(f" {BOLD}Article>{RESET} {articlepath}")
|
print(f" {BOLD}{G}-->{RESET} {articlepath}")
|
||||||
# Print the outputted file’s path when finished exporting the section
|
# Print the outputted file’s path when finished exporting the section
|
||||||
print(f"{BOLD}Section>{RESET} {sectionpath}\n")
|
print(f"{BOLD}{B}-->{RESET} {sectionpath}\n")
|
||||||
|
# Decrement export limit with length of exported section
|
||||||
|
toexport -= len(articles)
|
||||||
|
|
||||||
# Loop through each article that contains an unknown character
|
# Loop through each article that contains an unknown character
|
||||||
for article in unknown_chars_articles:
|
for article in unknown_chars_articles:
|
||||||
# Print the title of the article in which there is unknown characters
|
# Print the title of the article in which there is unknown characters
|
||||||
# & the number of them
|
# & the number of them
|
||||||
unknown_chars_apparitions: list[str] = article.get_unknown_chars()
|
unknown_chars_apparitions: list[str] = get_unknown_chars(article.text)
|
||||||
nb: int = len(unknown_chars_apparitions)
|
nb: int = len(unknown_chars_apparitions)
|
||||||
s: str = "s" if nb > 1 else ""
|
s: str = "s" if nb > 1 else ""
|
||||||
print(
|
print(
|
||||||
|
Loading…
Reference in New Issue
Block a user