Start a big refactor to properly structure main.py, in order to export section documents the same way as articles

This commit is contained in:
Guilhem Fauré 2023-05-23 15:32:53 +02:00
parent fdd25f3de6
commit bf6b8d4fe5

View File

@ -8,7 +8,15 @@ from sys import argv
from config import config from config import config
from converter import get_unknown_chars, unknown_chars from converter import get_unknown_chars, unknown_chars
from database import db from database import db
from items import Article, Sections from items import (
Article,
Articles,
Document,
Documents,
LimitCounter,
Section,
Sections,
)
# Print a stylized string, without trailing newline # Print a stylized string, without trailing newline
@ -51,129 +59,157 @@ def highlight(string: str, *start_stop: tuple[int, int]) -> None:
db.init(config.db, host=config.db_host, user=config.db_user, password=config.db_pass) db.init(config.db, host=config.db_host, user=config.db_user, password=config.db_pass)
db.connect() db.connect()
if __name__ == "__main__": # Only if script is directly executed
# Output information about ongoing export & write section to output destination
# Prints the section's position & title, creates the section directory under
# config.output_dir, writes the section content into it, then returns a tuple
# of the section's articles, the section's documents & the section directory
# NOTE(review): reads toexport, which in the visible code is only assigned in
# the __main__ block, so this presumably only works when run as a script
def write_section(
section: Section, counter: LimitCounter
) -> tuple[Articles, Documents, str]:
# Print the name of the exported section & number of remaining sections
style(f"{counter.count + 1}. ", BO)
highlight(section.title, *unknown_chars(section.title))
# The number of sections left is only printed when remaining() is above 2
if counter.remaining() > 2:
style(f" {counter.remaining()-1}", BO, G)
style(" sections")
print(" left to export", end="")
# NOTE(review): indentation is lost in this view, so whether this test is
# nested in the previous one cannot be told from here
if toexport > 1:
style(f" {toexport}", BO, Y)
style(" articles")
print(" left before export limit", end="")
# End the line started by the prints above, which all omit the newline
print()
# Define the sections path (directory) & create directory(ies) if needed
sectiondir: str = config.output_dir + "/" + section.get_slug()
makedirs(sectiondir, exist_ok=True)
# Define the section filename & write the index at that filename
sectionpath: str = sectiondir + "/" + section.get_filename()
with open(sectionpath, "w") as f:
f.write(section.get_content())
# Return the articles & documents of the section, and the directory it was
# written to, so the caller can export them below that directory
return (section.get_articles(), section.get_documents(), sectiondir)
# Output information about ongoing export & write article to output destination
def write_article(
    article: Article, counter: LimitCounter, sectiondir: str
) -> tuple[Documents, str]:
    """Print progress for an article and write it below its section directory.

    The article gets its own directory, ``sectiondir/<article slug>``, in
    which ``article.get_content()`` is written to ``article.get_filename()``.

    Recording articles that contain unknown characters is left to the caller:
    the main loop already appends the article right after this call, and
    appending here as well listed the same article twice.

    Returns a tuple of the article's documents and the article directory path.
    """
    # Print the remaining number of articles to export every 100 articles
    if counter.count % 100 == 0:
        s: str = "s" if counter.remaining() > 1 else ""
        print(" Exporting", end="")
        style(f" {counter.remaining()}", BO, Y)
        print(" SPIP", end="")
        style(f" article{s}")
        print(" to Markdown & YAML files")
    # Print the title of the article being exported
    style(
        f" {counter.count + 1}. "
        + ("EMPTY " if not article.text else "")
        + f"{article.lang} "
    )
    highlight(article.title, *unknown_chars(article.title))
    print()
    # Define the full article path & create directory(ies) if needed
    articledir: str = sectiondir + "/" + article.get_slug()
    makedirs(articledir, exist_ok=True)
    # Define the article filename & write the article at the filename
    articlepath: str = articledir + "/" + article.get_filename()
    # Force UTF-8 so the exported text does not depend on the platform's
    # default encoding, which can fail on non-ASCII characters
    with open(articlepath, "w", encoding="utf-8") as f:
        f.write(article.get_content())
    return (article.get_documents(), articledir)
# Output information about ongoing export & copy document to output destination
# The document is read from config.data_dir (with ~ expanded) & copied into
# objectdir under the name given by document.get_slug(); nothing is returned
# NOTE(review): the main loop also calls this for section documents, although
# the progress message below says "in this article"
def write_document(document: Document, counter: LimitCounter, objectdir: str) -> None:
# Print the remaining number of documents each time the count is a multiple
# of 100 (which includes the very first document, at count 0)
if counter.count % 100 == 0:
s: str = "s" if counter.remaining() > 1 else ""
print(" Exporting", end="")
style(f" {counter.remaining()}", BO, B)
style(f" document{s}")
print(" in this article")
# Print the name of the file with a counter
style(f" {counter.count + 1}. {document.media} ")
# The title is only printed when it is not empty
# NOTE(review): indentation is lost in this view, so whether style("at ")
# belongs to this branch cannot be told from here
if len(document.title) > 0:
highlight(document.title + " ", *unknown_chars(document.title))
style("at ")
print(document.file)
# Define document path
documentpath: str = expanduser(config.data_dir + "/" + document.file)
# Copy the document from its SPIP location to the new location
try:
copyfile(documentpath, objectdir + "/" + document.get_slug())
except FileNotFoundError:
# A missing source file is reported with its path instead of being raised
style(" NOT FOUND: ", BO, R)
print(documentpath)
else:
# Print the outputted files path when copied the file
style(" -->", BO, B)
print(f" {objectdir}/{document.get_slug()}")
# Return true if the article text contains an unknown character
def has_unknown_chars(article: Article) -> bool:
    """Tell whether the text of the article contains unknown characters.

    This used to return True unconditionally, so every article was reported
    as containing unknown characters. It now runs the same detection on
    ``article.text`` that the unknown characters report uses.
    """
    return bool(get_unknown_chars(article.text))
# Print the detected unknown chars in article in their context but highlighted
# Outputs a first line with the number of occurrences found in article.text,
# the article language & its title, then each occurrence returned by
# get_unknown_chars with its unknown characters highlighted
def warn_unknown_chars(article: Article) -> None:
# Print the title of the article in which there is unknown characters
# & the number of them
unknown_chars_apparitions: list[str] = get_unknown_chars(article.text)
nb: int = len(unknown_chars_apparitions)
# Plural mark for the word "character"
s: str = "s" if nb > 1 else ""
style(f"{nb}")
print(f" unknown character{s} in", end="")
style(f" {article.lang} ")
highlight(article.title, *unknown_chars(article.title))
print() # Break line
# Print the context in which the unknown characters are found
for text in unknown_chars_apparitions:
style("")
highlight(text, *unknown_chars(text))
style("\n")
# NOTE(review): indentation is lost in this view, so whether this final break
# line runs once per occurrence or once per article cannot be told from here
print() # Break line
# Main loop to execute only if script is directly executed
if __name__ == "__main__":
# Define max nb of articles to export based on first CLI argument # Define max nb of articles to export based on first CLI argument
if len(argv) >= 2: if len(argv) >= 2:
toexport = int(argv[1]) toexport = int(argv[1])
else: else:
toexport = config.default_export_max toexport = config.default_export_max
# Clear the output dir & create a new
if config.clear_output: if config.clear_output:
# Clear the output dir & create a new
rmtree(config.output_dir, True) rmtree(config.output_dir, True)
makedirs(config.output_dir, exist_ok=True) makedirs(config.output_dir, exist_ok=True)
# Articles that contains unknown chars # Make a list containing articles where unknown characters are detected
unknown_chars_articles: list[Article] = [] unknown_chars_articles: list[Article] = []
# Loop among first maxexport articles & export them # Loop among first maxexport articles & export them
for section, counter in Sections(): for section, counter in Sections(toexport):
# Define articles of the sections, limited by toexport # Write the section & store its articles
if toexport <= 0: articles, documents, sectiondir = write_section(section, counter)
break # Loop over sections related files (images …)
articles = section.get_articles(toexport) for document, counter in documents:
# Print the name of the exported section & number of remaining sections write_document(document, counter, sectiondir)
style(f"{counter.count + 1}. ", BO)
highlight(section.title, *unknown_chars(section.title))
if counter.remaining() > 2:
style(f" {counter.remaining()-1}", BO, G)
style(" sections")
print(" left to export", end="")
if toexport > 1:
style(f" {toexport}", BO, Y)
style(" articles")
print(" left before export limit", end="")
print()
# Define the sections path (directory) & create directory(ies) if needed
sectiondir: str = config.output_dir + "/" + section.get_slug()
makedirs(sectiondir, exist_ok=True)
# Define the section filename & write the index at that filename
sectionpath: str = sectiondir + "/" + section.get_filename()
with open(sectionpath, "w") as f:
f.write(section.get_content())
# Loop over sections articles # Loop over sections articles
for article, counter in articles: for article, counter in articles:
# Print the remaining number of articles to export every 100 articles documents, articledir = write_article(article, counter, sectiondir)
if counter.count % 100 == 0: # Add article to unknown_chars_articles if needed
s: str = "s" if counter.remaining() > 1 else "" if has_unknown_chars(article):
print(" Exporting", end="")
style(f" {counter.remaining()}", BO, Y)
print(" SPIP", end="")
style(f" article{s}")
print(" to Markdown & YAML files")
# Print the title of the article being exported
style(
f" {counter.count + 1}. "
+ ("EMPTY " if len(article.text) < 1 else "")
+ f"{article.lang} "
)
highlight(article.title, *unknown_chars(article.title))
print()
# Define the full article path & create directory(ies) if needed
articledir: str = sectiondir + "/" + article.get_slug()
makedirs(articledir, exist_ok=True)
# Define the article filename & write the article at the filename
articlepath: str = articledir + "/" + article.get_filename()
with open(articlepath, "w") as f:
f.write(article.get_content())
# Store articles with unknown characters
if len(get_unknown_chars(article.text)) > 0:
unknown_chars_articles.append(article) unknown_chars_articles.append(article)
# Loop over articles related files (images …) # Loop over articles related files (images …)
for document, counter in article.get_documents(): for document, counter in documents:
if counter.count % 100 == 0: write_document(document, counter, articledir)
s: str = "s" if counter.remaining() > 1 else "" # Break 2 lines when finished exporting the section
print(" Exporting", end="") print("\n")
style(f" {counter.remaining()}", BO, B)
style(f" document{s}")
print(" in this article")
# Print the name of the file with a counter
style(f" {counter.count + 1}. {document.media} ")
if len(document.title) > 0:
highlight(document.title + " ", *unknown_chars(document.title))
style("at ")
print(document.file)
# Define document path
documentpath: str = expanduser(config.data_dir + "/" + document.file)
# Copy the document from its SPIP location to the new location
try:
copyfile(documentpath, articledir + "/" + document.get_slug())
except FileNotFoundError:
style(" NOT FOUND: ", BO, R)
print(documentpath)
else:
# Print the outputted files path when copied the file
style(" -->", BO, B)
print(f" {articledir}/{document.get_slug()}")
# Print the outputted files path when finished exporting the article
style(" --> ", BO, Y)
print(articlepath)
# Print the outputted files path when finished exporting the section
style("--> ", BO, G)
print(sectionpath)
print()
# Decrement export limit with length of exported section
toexport -= len(articles)
print() # Break line
# Loop through each article that contains an unknown character # Loop through each article that contains an unknown character
for article in unknown_chars_articles: for article in unknown_chars_articles:
# Print the title of the article in which there is unknown characters warn_unknown_chars(article)
# & the number of them
unknown_chars_apparitions: list[str] = get_unknown_chars(article.text)
nb: int = len(unknown_chars_apparitions)
s: str = "s" if nb > 1 else ""
style(f"{nb}")
print(f" unknown character{s} in", end="")
style(f" {article.lang} ")
highlight(article.title, *unknown_chars(article.title))
print() # Break line
# Print the context in which the unknown characters are found
for text in unknown_chars_apparitions:
style("")
highlight(text, *unknown_chars(text))
style("\n")
print() # Break line
db.close() # Close the connection with the database db.close() # Close the connection with the database