coloring big refactor + init copy of assets

This commit is contained in:
Guilhem Fauré 2023-05-22 16:48:47 +02:00
parent 95ccc5fab5
commit ae7063e241
3 changed files with 89 additions and 49 deletions

View File

@ -275,6 +275,7 @@ unknown_iso = (
)
# Apply spip_to_markdown conversions to a text
def convert_body(text: Optional[str]) -> str:
if text is None:
return ""
@ -285,6 +286,7 @@ def convert_body(text: Optional[str]) -> str:
return text
# Apply spip_to_text conversions to a text
def convert_meta(text: Optional[str]) -> str:
if text is None:
return ""
@ -295,22 +297,23 @@ def convert_meta(text: Optional[str]) -> str:
return text
# Replace unknown chars with empty strings (delete them)
def remove_unknown_chars(text: str) -> str:
    """Return text with every occurrence of the strings in unknown_iso removed.

    text: the string to clean
    """
    for char in unknown_iso:
        # str.replace returns a new string (str is immutable), so the result
        # must be rebound, otherwise the input is returned unchanged
        text = text.replace(char, "")
    return text
def highlight_unknown_chars(text: str, pre: str, post: str) -> str:
# Add pre before unknown char and post after unknown char
# Return a list of tuples giving the start and end of unknown substring in text
def unknown_chars(text: str) -> list[tuple[int, int]]:
positions: list[tuple[int, int]] = []
for char in unknown_iso:
for match in finditer("(" + char + ")+", text):
text = (
text[: match.start()] + pre + match.group() + post + text[match.end() :]
)
return text
positions.append((match.start(), match.end()))
return positions
# Return strings with unknown chars found in text, surrounded by context_length chars
def get_unknown_chars(text: str, context_length: int = 20) -> list[str]:
errors: list[str] = []
context: str = r".{0," + str(context_length) + r"}"

View File

@ -166,6 +166,7 @@ class Document:
self.creation: str = document.date
self.publication: str = document.date_publication
self.update: str = document.maj
self.media: str = document.media
def get_slug(self, date: bool = False) -> str:
    # Optionally prefix the slug source with the publication date and a dash
    prefix: str = self.publication + "-" if date else ""
    # slugify receives the (optionally prefixed) title
    return slugify(prefix + self.title)

View File

@ -1,26 +1,56 @@
#!python
# pyright: strict
from os import makedirs, mkdir
from shutil import rmtree
from shutil import copyfile, rmtree
from sys import argv
from config import config
from converter import get_unknown_chars, highlight_unknown_chars
from converter import get_unknown_chars, unknown_chars
from database import db
from items import Article, Sections
from items import Article, Documents, Sections
# Print a stylized string, without trailing newline
def style(string: str, *args: int) -> None:
    """Print string wrapped in a terminal escape sequence, without trailing newline.

    string: the text to print
    args: numeric style/color codes to combine; bold (1) is used when none is given
    """
    esc = "\033["  # Terminal escape sequence, needs to be closed by "m"
    # Codes are separated by ";" inside the sequence; defaults to bold
    params: str = ";".join(str(a) for a in args) if args else "1"
    # Emit the styled text, then reset all attributes with code 0
    print(esc + params + "m" + string + esc + "0m", end="")
# Define styles
# Numeric codes passed to style(), which joins them with ";" inside "\033[…m"
BO = 1 # Bold
IT = 3 # Italic
UN = 4 # Underline
# Define colors
# Codes 91-96 are the bright foreground colors of ANSI terminals
R = 91 # Red
G = 92 # Green
Y = 93 # Yellow
B = 94 # Blue
C0 = 95 # Color (bright magenta)
C1 = 96 # Color (bright cyan)
# NOTE(review): C2 has the same code as C1, so both render identically;
# possibly meant to be a distinct color — confirm
C2 = 96 # Color
# Print a string, highlighting every substring delimited by a (start, stop) span
def highlight(string: str, *start_stop: tuple[int, int]) -> None:
    """Print string without trailing newline, with each given span styled bold red.

    string: the text to print
    start_stop: (start, stop) index pairs into string, accepted in any order

    Spans are handled in ascending order. The part of a span that overlaps an
    already printed span is not printed a second time, and empty spans are skipped.
    """
    previous_stop = 0
    # Spans may arrive unordered (e.g. collected pattern by pattern rather than
    # by position), so sort them to keep the printed text in reading order
    for start, stop in sorted(start_stop):
        # Clamp overlapping spans so no character is printed twice
        start = max(start, previous_stop)
        if stop <= start:
            continue
        # Plain text between the previous span and this one
        print(string[previous_stop:start], end="")
        style(string[start:stop], BO, R)
        previous_stop = stop
    # Remaining plain text after the last span
    print(string[previous_stop:], end="")
# Define terminal escape sequences to stylize output
R: str = "\033[91m"
G: str = "\033[92m"
B: str = "\033[94m"
BOLD: str = "\033[1m"
RESET: str = "\033[0m"
# Connect to the MySQL database with Peewee ORM
db.init(config.db, host=config.db_host, user=config.db_user, password=config.db_pass)
db.connect()
if __name__ == "__main__": # Following is executed only if script is directly executed
if __name__ == "__main__": # Only if script is directly executed
# Define max nb of articles to export based on first CLI argument
if len(argv) >= 2:
toexport = int(argv[1])
@ -41,22 +71,16 @@ if __name__ == "__main__": # Following is executed only if script is directly e
break
articles = section.get_articles(toexport)
# Print the name of the exported section & number of remaining sections
print(
f"{BOLD}{counter.count + 1}. {RESET}"
+ highlight_unknown_chars(section.title, R, RESET),
end="",
)
style(f"{counter.count + 1}. ", BO)
highlight(section.title, *unknown_chars(section.title))
if counter.remaining() > 2:
print(
f" {BOLD}{B}{counter.remaining()-1}{RESET} {BOLD}sections left"
+ RESET,
end="",
)
style(f" {counter.remaining()-1}", BO, G)
style(" sections")
print(" left to export", end="")
if toexport > 1:
print(
f" {BOLD}Export limit is in {R}{toexport}{RESET} articles{RESET}",
end="",
)
style(f" {toexport}", BO, Y)
style(" articles")
print(" left before export limit", end="")
print()
# Define the sections path (directory) & create directory(ies) if needed
sectiondir: str = config.output_dir + "/" + section.get_slug()
@ -70,17 +94,18 @@ if __name__ == "__main__": # Following is executed only if script is directly e
# Print the remaining number of articles to export every 100 articles
if counter.count % 100 == 0:
s: str = "s" if counter.remaining() > 1 else ""
print(
f" {BOLD}Exporting {G}{counter.remaining()}{RESET}"
+ f"{BOLD} SPIP article{s}{RESET} to Markdown & YAML files"
)
print(" Exporting", end="")
style(f" {counter.remaining()}", BO, Y)
print(" SPIP", end="")
style(f" article{s}")
print(" to Markdown & YAML files")
# Print the title of the article being exported
print(
f" {BOLD}{counter.count + 1}. "
style(
f" {counter.count + 1}. "
+ ("EMPTY " if len(article.text) < 1 else "")
+ f"{article.lang} {RESET}"
+ highlight_unknown_chars(article.title, R, RESET)
+ f"{article.lang} "
)
highlight(article.title, *unknown_chars(article.title))
# Define the full article path & create directory(ies) if needed
articledir: str = sectiondir + "/" + article.get_slug()
makedirs(articledir, exist_ok=True)
@ -91,10 +116,23 @@ if __name__ == "__main__": # Following is executed only if script is directly e
# Store articles with unknown characters
if len(get_unknown_chars(article.text)) > 0:
unknown_chars_articles.append(article)
# Loop over articles related files (images …)
for document, counter in Documents(article.id):
    # Print the name of the file with a counter
    style(f" {counter.count + 1}. {document.media} ")
    # Highlight the document's own title (not the parent article's title)
    highlight(document.title, *unknown_chars(document.title))
    # Destination lives inside the article directory, so the file is copied
    # to the same path that is printed below
    documentpath: str = articledir + "/" + document.get_slug()
    # Copy the document from its SPIP location to the new location
    copyfile(config.data_dir + "/" + document.file, documentpath)
    # Print the outputted files path when copied the file
    style(" -->", BO, B)
    print(f" {documentpath}")
# Print the outputted files path when finished exporting the article
print(f" {BOLD}{G}-->{RESET} {articlepath}")
style(" --> ", BO, Y)
print(articlepath)
# Print the outputted files path when finished exporting the section
print(f"{BOLD}{B}-->{RESET} {sectionpath}\n")
style(" --> ", BO, G)
print(sectionpath)
print()
# Decrement export limit with length of exported section
toexport -= len(articles)
@ -105,16 +143,14 @@ if __name__ == "__main__": # Following is executed only if script is directly e
unknown_chars_apparitions: list[str] = get_unknown_chars(article.text)
nb: int = len(unknown_chars_apparitions)
s: str = "s" if nb > 1 else ""
print(
f"\n{BOLD}{nb}{RESET} unknown character{s} in {BOLD}{article.lang}{RESET} "
+ highlight_unknown_chars(article.title, R, RESET)
)
style(f"\n{nb}")
print(f" unknown character{s} in")
style(f" {article.lang} ")
highlight(article.title, *unknown_chars(article.title))
# Print the context in which the unknown characters are found
for text in unknown_chars_apparitions:
print(
f" {BOLD}{RESET} "
+ highlight_unknown_chars(text, R, RESET)
+ f" {BOLD}{RESET}"
)
style("")
highlight(text, *unknown_chars(text))
style("")
db.close() # Close the connection with the database