add option to remove markup of metadata fields

This commit is contained in:
Guilhem Fauré 2023-06-16 16:50:05 +02:00
parent e1cd926078
commit fd56e86dda
3 changed files with 18 additions and 9 deletions

View File

@ -100,6 +100,7 @@ ignore_patterns: [] # List of regexes: Matching sections or articles will be
# Text body processing settings
remove_html: true # Should we clean remaining HTML blocks
metadata_markup: true # Should we keep markup (Markdown) in metadata fields (like title)
unknown_char_replacement: ?? # String to replace broken encoding that cannot be repaired
# Settings you probably don't want to modify

View File

@ -71,6 +71,7 @@ class Configuration:
export_drafts: bool = True # Should we export drafts as draft:true articles
export_empty: bool = True # Should we export empty articles
remove_html: bool = True # Should spip2md remove all HTML tags
metadata_markup: bool = True # Should spip2md keep the markup in metadata fields
title_max_length: int = 40 # Maximum length of a single title for directory names
unknown_char_replacement: str = "??" # Replaces unknown characters
clear_log: bool = True # Clear log before every run instead of appending to it

View File

@ -17,6 +17,7 @@ import logging
from os import listdir, mkdir
from os.path import basename, isfile, splitext
from re import I, Match, Pattern, finditer, match, search
from re import error as re_error
from shutil import copyfile
from typing import Any, Optional
@ -82,14 +83,20 @@ class SpipWritable:
# Apply a mapping from regex maps
@staticmethod
def apply_mapping(text: str, mapping: tuple) -> str:
def apply_mapping(text: str, mapping: tuple, keep_markup: bool = True) -> str:
if type(mapping) == tuple and len(mapping) > 0:
if type(mapping[0]) == tuple and len(mapping[0]) > 0:
if type(mapping[0][0]) == Pattern:
if type(mapping[0][0]) == Pattern: # Mostly for syntax conversion
for old, new in mapping:
if keep_markup:
text = old.sub(new, text)
else:
for old, new in mapping:
try:
text = old.sub(r"\1", text)
except re_error:
text = old.sub("", text)
else:
for old, new in mapping: # Mostly for broken encoding
text = text.replace(old, new)
elif type(mapping[0]) == Pattern:
for old in mapping:
@ -129,18 +136,18 @@ class SpipWritable:
return text
# Apply needed methods on text fields
def convert_field(self, field: Optional[str], clean_html: bool = True) -> str:
def convert_field(self, field: Optional[str], keep_markup: bool = True) -> str:
if field is None:
return ""
if len(field) == 0:
return ""
# Convert SPIP syntax to Markdown
field = self.apply_mapping(field, SPIP_MARKDOWN)
field = self.apply_mapping(field, SPIP_MARKDOWN, keep_markup)
# Remove useless text
field = self.apply_mapping(field, BLOAT)
# Convert broken ISO encoding to UTF
field = self.apply_mapping(field, ISO_UTF)
if clean_html:
if CFG.remove_html:
# Delete remaining HTML tags in body WARNING
field = self.apply_mapping(field, HTMLTAGS)
# Warn about unknown chars
@ -510,7 +517,7 @@ class SpipRedactional(SpipWritable):
self._url_title = self.replace_links(self._url_title)
LOG.debug(f"Apply conversions to {self.lang} `{self._url_title}` title")
self._storage_title = self.convert_field(self._storage_title)
self._url_title = self.convert_field(self._url_title)
self._url_title = self.convert_field(self._url_title, CFG.metadata_markup)
for p in CFG.ignore_patterns:
for title in (self._storage_title, self._url_title):
m = match(p, title, I)
@ -554,7 +561,7 @@ class SpipRedactional(SpipWritable):
LOG.debug(f"Convert internal links of {self.lang} `{self._url_title}` extra")
self._extra = self.replace_links(self._extra)
LOG.debug(f"Apply conversions to {self.lang} `{self._url_title}` extra")
self._extra = self.convert_field(self._extra)
self._extra = self.convert_field(self._extra, CFG.metadata_markup)
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)