switch to new grammar

This commit is contained in:
Guilhem Fauré 2023-04-28 15:44:50 +02:00
parent 1e59bb6833
commit 28ab3123df
3 changed files with 55 additions and 152 deletions

View File

@ -2,8 +2,7 @@ from os import path
from lark import Lark
# spipParser = Lark(open(path.dirname(__file__) + "/spip.lark"))
spipParser = Lark(open(path.dirname(__file__) + "/spip.flex.lark"))
spipParser = Lark(open(path.dirname(__file__) + "/spip.lark"))
class content:

View File

@ -1,75 +0,0 @@
// Flexible SPIP Markup grammar for Lark parser
//
// Lark naming conventions used throughout this file:
//   - names starting with "_" are not kept as nodes in the parse tree
//     (rules are inlined into their parent, terminals are filtered out);
//   - rules starting with "?" are inlined when they have a single child;
//   - "-> name" renames the matched alternative in the parse tree;
//   - "rule{param}" is a rule template instantiated with the given argument.

// A document is one or more blocks separated by at least one newline,
// with optional leading and trailing newlines.
start: _N* block ( _N+ block )* _N*
// Block-level constructs. A bare HORIZONTAL_RULE token is exposed in the tree
// under the name "horizontal_rule".
?block: paragraph
| heading
| list
| table
| tag
| HORIZONTAL_RULE -> horizontal_rule
// Four or more consecutive hyphens.
HORIZONTAL_RULE: /----+/
// A list is a run of items that all use the same kind of marker.
?list: unordered_list
| ordered_list
// Every item is: marker, inline content, then a mandatory newline
// (so the last item must also be followed by a newline to match).
unordered_list: ( _UL list_item _N )+
ordered_list: ( _OL list_item _N )+
list_item: _inline{TEXT}+
// Unordered marker: "-*", or "-" followed by any single character other than
// "#" or "-". NOTE(review): in the second form that following character is
// consumed as part of the marker, so it is not part of the list_item text.
_UL: /-\*|-[^#-]/
// Ordered marker: "-#".
_OL: /-#/
// A table is an optional metadata line of the form "||title|description||"
// followed by one or more rows; each of these lines ends with a newline.
table: ( _TBL_META table_metadata "||" _N )? ( table_row _N )+
table_metadata: table_title "|" table_description
table_title: _inline{TABLE_TEXT}
table_description: _inline{TABLE_TEXT}
// A row is one or more cells, each introduced by "|", closed by a final "|".
table_row: ( _TBL table_cell )+ "|"
table_cell: _inline{TABLE_TEXT}
_TBL_META: "||"
_TBL: "|"
// Heading: "{{{" followed by one or more inline elements and a closing "}}}".
heading: _H _inline{MARKED_TEXT}+ "}}}"
_H: "{{{"
// Paragraph: one or more inline elements, each optionally followed by a single
// newline.
paragraph: ( _inline{TEXT} _N? )+
// Inline content template. "text" is the plain-text terminal to use in the
// calling context (TEXT, TABLE_TEXT or MARKED_TEXT in this file).
_inline{text}: text
| emphasis
| strong
| anchor
| tag
// Plain text in paragraphs and list items: one or more characters that are not
// CR, LF or "{". The negative lookahead rejects a character when "->" can be
// reached after it without crossing a "[" or a line break.
TEXT: /(?:[^\r\n\{](?![^\[\n\r]*->))+/
// Plain text inside tables: any run of characters except "|", CR, LF and "{".
TABLE_TEXT: /[^\|\r\n\{]+/
// Plain text inside headings, strong and emphasis: any run of characters except
// "}", CR, LF and "{".
MARKED_TEXT: /[^\}\r\n\{]+/
// Strong / emphasis: opening marker, one or more inline elements, then either
// the matching closing marker or a newline (a newline also ends the span).
strong: _B ( _inline{MARKED_TEXT} )+ ( "}}" | _N )
emphasis: _I ( _inline{MARKED_TEXT} )+ ( "}" | _N )
// "{{" that is immediately followed by a character other than "{".
_B: /{{(?=[^\{])/
// "{" that is immediately followed by a character other than "{".
_I: /{(?=[^\{])/
// Links. The anchor_normal alternative appears in the tree as "anchor".
?anchor: anchor_footnote
| anchor_wikipedia
| anchor_normal -> anchor
// "[[" HREF "]]"
anchor_footnote: _FOOT HREF "]]"
// "[?" HREF "]"
anchor_wikipedia: _WIKI HREF "]"
// "[" A_TEXT "->" HREF "]"
anchor_normal: _A A_TEXT "->" HREF "]"
_FOOT: /\[\[/
_WIKI: /\[\?/
// "[" only when it is followed by at least one character that is not "[", LF
// or CR, and then "->" (checked with a lookahead, nothing else is consumed).
_A: /\[(?=[^\[\n\r]+->)/
HREF: _PURE_TEXT
// Link label: the shortest non-empty run of characters other than CR, LF and
// "{" that is followed by "->" (the "->" itself is not consumed).
A_TEXT: /[^\r\n\{]+?(?=->)/
// Tags: "<" or "</", a name, any number of "|option" parts, then ">".
tag: end_tag
| start_tag
end_tag: _ETAG TAG_NAME ( "|" TAG_OPTION )* ">"
start_tag: _STAG TAG_NAME ( "|" TAG_OPTION )* ">"
_STAG: "<"
_ETAG: "</"
TAG_NAME: _PURE_TEXT
TAG_OPTION: _PURE_TEXT
// Line break: LF, optionally preceded by CR.
_N: /\r?\n/
// One or more ASCII letters, digits, "_", ":", "/", "-" or ".".
_PURE_TEXT: /[0-9A-Za-z_:\/\-\.]+/

View File

@ -1,96 +1,75 @@
// SPIP Markup grammar for Lark parser
start: _N* block ( _N+ block )+ _N*
// Flexible SPIP Markup grammar for Lark parser
start: _N* block ( _N+ block )* _N*
?block: paragraph
| heading
| list
| table
| orphan_tag
| SEPARATOR -> hr
| tag
| HORIZONTAL_RULE -> horizontal_rule
HORIZONTAL_RULE: /----+/
?list: unordered_list
| ordered_list
unordered_list: ( _HYPHEN_STAR list_element _N )+ -> ul
ordered_list: ( _HYPHEN_HASH list_element _N )+ -> ol
list_element: _inline_format -> li
unordered_list: ( _UL list_item _N )+
ordered_list: ( _OL list_item _N )+
list_item: _inline{TEXT}+
_UL: /-\*|-[^#-]/
_OL: /-#/
table: ( table_meta _N )? ( row _N )+ -> table
table_meta: _PIPE _PIPE TEXT _PIPE TEXT _PIPE _PIPE -> title_description
row: ( _PIPE cell )+ _PIPE -> tr
cell: _inline_format -> td
table: ( _TBL_META table_metadata "||" _N )? ( table_row _N )+
table_metadata: table_title "|" table_description
table_title: _inline{TABLE_TEXT}
table_description: _inline{TABLE_TEXT}
table_row: ( _TBL table_cell )+ "|"
table_cell: _inline{TABLE_TEXT}
_TBL_META: "||"
_TBL: "|"
heading: _O_CURLY_3 ( TEXT | link | nested_italic | nested_bold ) _C_CURLY_3 -> h2
heading: _H _inline{MARKED_TEXT}+ "}}}"
_H: "{{{"
paragraph: ( _inline_format _N? )+ -> p
paragraph: ( _inline{TEXT} _N? )+
_inline_format: TEXT
| italic
| bold
| link
| orphan_tag
| orphan_quote
| tag
_inline{text}: text
| emphasis
| strong
| anchor
| tag
bold: _O_CURLY_2 ( TEXT | link | nested_italic )+ _C_CURLY_2 -> strong
italic: _O_CURLY ( TEXT | link | nested_bold )+ _C_CURLY -> em
nested_bold: TEXT+ _O_CURLY_2 ( TEXT | link )+ _C_CURLY_2 -> strong
nested_italic: TEXT+ _O_CURLY ( TEXT | link )+ _C_CURLY -> em
TEXT: /(?:[^\r\n\{](?![^\[\n\r]*->))+/
TABLE_TEXT: /[^\|\r\n\{]+/
MARKED_TEXT: /[^\}\r\n\{]+/
?link: footnote
| wikipedia_link
| a
strong: _B ( _inline{MARKED_TEXT} )+ ( "}}" | _N )
emphasis: _I ( _inline{MARKED_TEXT} )+ ( "}" | _N )
_B: /{{(?=[^\{])/
_I: /{(?=[^\{])/
footnote: _O_SQUARE_2 HREF _C_SQUARE_2 -> footnote
wikipedia_link: _O_SQUARE_INTERO HREF _C_SQUARE -> a_wikipedia
a: _LINK_OPENING LINK_TEXT _ARROW HREF _C_SQUARE -> a
?anchor: anchor_footnote
| anchor_wikipedia
| anchor_normal -> anchor
tag: closing_quote
| opening_quote
| closing_tag
| opening_tag
anchor_footnote: _FOOT HREF "]]"
anchor_wikipedia: _WIKI HREF "]"
anchor_normal: _A A_TEXT "->" HREF "]"
_FOOT: /\[\[/
_WIKI: /\[\?/
_A: /\[(?=[^\[\n\r]+->)/
HREF: _PURE_TEXT
A_TEXT: /[^\r\n\{]+?(?=->)/
orphan_quote: _ORPHAN_OPENING "quote" _C_ANGLE -> orphan_quote
orphan_tag: _ORPHAN_OPENING TEXT ( _PIPE TEXT )* _C_ANGLE -> orphan_tag
tag: end_tag
| start_tag
closing_quote: _O_ANGLE _SLASH "quote" _C_ANGLE -> closing_quote
opening_quote: _O_ANGLE "quote" _C_ANGLE -> opening_quote
closing_tag: _O_ANGLE _SLASH TEXT ( _PIPE TEXT )* _C_ANGLE -> closing_tag
opening_tag: _O_ANGLE TEXT ( _PIPE TEXT )* _C_ANGLE -> opening_tag
end_tag: _ETAG TAG_NAME ( "|" TAG_OPTION )* ">"
start_tag: _STAG TAG_NAME ( "|" TAG_OPTION )* ">"
_STAG: "<"
_ETAG: "</"
TAG_NAME: _PURE_TEXT
TAG_OPTION: _PURE_TEXT
// Terminals
_N: /\r?\n/
SEPARATOR: "----" "-"*
TEXT: ( PURE_TEXT | AMBIGUOUS_TEXT )+
_HYPHEN_STAR: "-*"
_HYPHEN_HASH: "-#"
_PIPE: "|"
_SLASH: "/"
_O_ANGLE: "<"
_C_ANGLE: ">"
_ORPHAN_OPENING: /<(?=([^>\/]+?)>)(?!.*<\/\1>)/
_O_CURLY_3: "{{{"
_C_CURLY_3: "}}}"
_O_CURLY_2: "{{"
_C_CURLY_2: "}}"
_O_CURLY: "{"
_C_CURLY: "}"
_O_SQUARE_2: "[["
_C_SQUARE_2: "]]"
_O_SQUARE_INTERO: "[?"
_C_SQUARE: "]"
_ARROW: "->"
_LINK_OPENING: /\[(?=/ PURE_TEXT+ /]*->)/
HREF: PURE_TEXT+
LINK_TEXT: PURE_TEXT+ /(?=->)/
/// Characters that could be markup but aren't in this situation
AMBIGUOUS_TEXT: /\[(?!.*->.*\])/
| /\]/
| /\\}/
/// Every character that has no markup meaning
PURE_TEXT: /[^\r\n\|\{\}\[\]\<\>]/
_PURE_TEXT: /[0-9A-Za-z_:\/\-\.]+/