first try more flexible grammar

This commit is contained in:
Guilhem Fauré 2023-04-28 12:04:30 +02:00
parent e2d5dd421d
commit 33f26c3ad5
2 changed files with 45 additions and 35 deletions

View File

@ -2,7 +2,8 @@ from os import path
from lark import Lark from lark import Lark
spipParser = Lark(open(path.dirname(__file__) + "/spip.lark")) # spipParser = Lark(open(path.dirname(__file__) + "/spip.lark"))
spipParser = Lark(open(path.dirname(__file__) + "/spip.flex.lark"))
class content: class content:

View File

@ -1,5 +1,4 @@
// SPIP Markup grammar for Lark parser, more flexible try // Flexible SPIP Markup grammar for Lark parser
start: _N* block ( _N+ block )+ _N* start: _N* block ( _N+ block )+ _N*
?block: paragraph ?block: paragraph
@ -12,73 +11,83 @@ start: _N* block ( _N+ block )+ _N*
?list: unordered_list ?list: unordered_list
| ordered_list | ordered_list
unordered_list: ( _HYPHEN list_element _N )+ unordered_list: ( _HYPHEN _STAR? list_item _N )+
ordered_list: ( _HYPHEN_HASH list_element _N )+ ordered_list: ( _HYPHEN_HASH list_item _N )+
list_item: _inline list_item: _inline
table: ( table_metadata _N )? ( row _N )+ table: ( _PIPE~2 table_title _PIPE table_description _PIPE~2 _N )? ( table_row _N )+
table_metadata: _PIPE _PIPE TEXT _PIPE TEXT _PIPE _PIPE table_title: _table_inline
table_row: ( _PIPE cell )+ _PIPE table_description: _table_inline
table_cell: _inline table_row: ( _PIPE table_cell )+ _PIPE
table_cell: _table_inline
heading: _O_CURLY_3 ( _inline /(?=/ _C_CURLY_3 /)/ ) _C_CURLY_3 heading: _O_CURLY_3 _markup_inline _C_CURLY_3
paragraph: ( _inline _N? )+ paragraph: ( _inline _N? )+
_inline: TEXT /(?=/ ( _O_CURLY | _O_SQUARE | _O_ANGLE ) /)/ _table_inline: TABLE_TEXT
| emphasis | emphasis
| strong | strong
| anchor | anchor
| tag | tag
strong: _O_CURLY_2 ( _inline /(?=/ _C_CURLY_2 /)/ )+ _C_CURLY_2 _inline: TEXT
emphasis: _O_CURLY ( _inline /(?=/ _C_CURLY /)/ )+ _C_CURLY | emphasis
| strong
| anchor
| tag
strong: _O_CURLY_2 ( _markup_inline )+ _C_CURLY_2
emphasis: _O_CURLY ( _markup_inline )+ _C_CURLY
?anchor: anchor_footnote ?anchor: anchor_footnote
| anchor_wikipedia | anchor_wikipedia
| anchor_normal -> anchor | anchor_normal -> anchor
footnote: _O_SQUARE_2 HREF _C_SQUARE_2 _markup_inline: MARKUP_TEXT
wikipedia_link: _O_SQUARE_INTERO HREF _C_SQUARE | emphasis
a: _LINK_OPENING LINK_TEXT _ARROW HREF _C_SQUARE | strong
| anchor
| tag
tag: closing_quote anchor_footnote: _O_SQUARE_2 HREF _C_SQUARE_2
| opening_quote anchor_wikipedia: _O_SQUARE_INTERO HREF _C_SQUARE
| closing_tag anchor_normal: _LINK_O_SQUARE ANCHOR_TEXT _ARROW HREF _C_SQUARE
| opening_tag
orphan_tag: _ORPHAN_OPENING TEXT ( _PIPE TEXT )* _C_ANGLE tag: end_tag
start_tag: _O_ANGLE _SLASH TEXT ( _PIPE TEXT )* _C_ANGLE | start_tag
end_tag: _O_ANGLE TEXT ( _PIPE TEXT )* _C_ANGLE
start_tag: _O_ANGLE _SLASH PURE_TEXT+ ( _PIPE PURE_TEXT+ )* _C_ANGLE
end_tag: _O_ANGLE PURE_TEXT+ ( _PIPE PURE_TEXT+ )* _C_ANGLE
// Terminals // Terminals
HORIZONTAL_RULE: /----+/
TABLE_TEXT: /.+?(?=[\{\[\|])/
TEXT: /.+?(?=[\{\[\<])/
MARKUP_TEXT: /.+?(?=[\{\}\[\]])/
PURE_TEXT: /[^\r\n\|\{\}\[\]\<\>]/
ANCHOR_TEXT: PURE_TEXT+ /(?=->)/
HREF: PURE_TEXT+
// Filtered terminals
_N: /\r?\n/ _N: /\r?\n/
HORIZONTAL_RULE: "----" "-"* _O_MARKUP: _O_CURLY | _O_ANGLE | _O_SQUARE | _HYPHEN | _PIPE
TEXT: /.+?/
_HYPHEN: "-" _HYPHEN: "-"
_STAR: "*"
_HYPHEN_HASH: "-#" _HYPHEN_HASH: "-#"
_PIPE: "|" _PIPE: "|"
_SLASH: "/" _SLASH: "/"
_O_ANGLE: "<" _O_ANGLE: "<"
_C_ANGLE: ">" _C_ANGLE: ">"
_ORPHAN_OPENING: /<(?=([^>\/]+?)>)(?!.*<\/\1>)/
_O_CURLY_3: "{{{" _O_CURLY_3: "{{{"
_C_CURLY_3: "}}}" _C_CURLY_3: "}}}"
_O_CURLY_2: "{{" _O_CURLY_2: "{{"
_C_CURLY_2: "}}" _C_CURLY_2: "}}"
_O_CURLY: "{" _O_CURLY: "{"
_C_CURLY: "}" _C_CURLY: "}"
_O_SQUARE_2: "[[" _O_SQUARE_2: "[["
_C_SQUARE_2: "]]" _C_SQUARE_2: "]]"
_O_SQUARE_INTERO: "[?" _O_SQUARE_INTERO: "[?"
_O_SQUARE: "["
_C_SQUARE: "]" _C_SQUARE: "]"
_ARROW: "->" _ARROW: "->"
_LINK_OPENING: /\[(?=/ PURE_TEXT+ /]*->)/ _LINK_O_SQUARE: _O_SQUARE /(?=/ PURE_TEXT+ /->)/
HREF: PURE_TEXT+
LINK_TEXT: PURE_TEXT+ /(?=->)/
/// Every character that has no markup meaning
PURE_TEXT: /[^\r\n\|\{\}\[\]\<\>]/