Tighten lexing of T_TEXT nodes.
Thanks to some heavy rubberducking with @whitequark, the lexer is now a little bit better at lexing T_TEXT nodes. For example, previously the following could not be lexed properly: "foo < bar". There might still be some tweaking to do, but we're getting there.
This commit is contained in:
parent
145315c26a
commit
49ddebf358
|
@ -37,7 +37,8 @@
|
||||||
|
|
||||||
newline = '\n' | '\r\n';
|
newline = '\n' | '\r\n';
|
||||||
whitespace = [ \t];
|
whitespace = [ \t];
|
||||||
identifier = [a-zA-Z0-9\-_]+;
|
ident_char = [a-zA-Z0-9\-_];
|
||||||
|
identifier = ident_char+;
|
||||||
|
|
||||||
# Comments
|
# Comments
|
||||||
#
|
#
|
||||||
|
@ -209,13 +210,19 @@
|
||||||
# body of an element is lexed using the `main` machine.
|
# body of an element is lexed using the `main` machine.
|
||||||
#
|
#
|
||||||
|
|
||||||
element_end = '</' identifier (':' identifier)* '>';
|
element_start = '<' ident_char;
|
||||||
|
element_end = '</' identifier (':' identifier)* '>';
|
||||||
|
|
||||||
action start_element {
|
action start_element {
|
||||||
callback_simple("on_element_start");
|
callback_simple("on_element_start");
|
||||||
|
fhold;
|
||||||
fnext element_name;
|
fnext element_name;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
action close_element {
|
||||||
|
callback_simple("on_element_end");
|
||||||
|
}
|
||||||
|
|
||||||
# Machine used for lexing the name/namespace of an element.
|
# Machine used for lexing the name/namespace of an element.
|
||||||
element_name := |*
|
element_name := |*
|
||||||
identifier ':' => {
|
identifier ':' => {
|
||||||
|
@ -262,6 +269,46 @@
|
||||||
};
|
};
|
||||||
*|;
|
*|;
|
||||||
|
|
||||||
|
# Text
|
||||||
|
#
|
||||||
|
# http://www.w3.org/TR/xml/#syntax
|
||||||
|
# http://www.w3.org/TR/html-markup/syntax.html#text-syntax
|
||||||
|
#
|
||||||
|
# Text content is everything leading up to certain special tags such as "</"
|
||||||
|
# and "<?".
|
||||||
|
|
||||||
|
action start_text {
|
||||||
|
fhold;
|
||||||
|
fnext text;
|
||||||
|
}
|
||||||
|
|
||||||
|
# These characters terminate a T_TEXT sequence and instruct Ragel to jump
|
||||||
|
# back to the main machine.
|
||||||
|
#
|
||||||
|
# Note that this only works if each sequence is exactly 2 characters
|
||||||
|
# long. Because of this "<!" is used instead of "<!--".
|
||||||
|
|
||||||
|
terminate_text = '</' | '<!' | '<?' | element_start;
|
||||||
|
allowed_text = any* -- terminate_text;
|
||||||
|
|
||||||
|
text := |*
|
||||||
|
# Text followed by a special tag, such as "foo<!--"
|
||||||
|
allowed_text @{ mark = p; } terminate_text => {
|
||||||
|
callback("on_text", data, encoding, ts, mark);
|
||||||
|
|
||||||
|
p = mark - 1;
|
||||||
|
mark = 0;
|
||||||
|
|
||||||
|
fnext main;
|
||||||
|
};
|
||||||
|
|
||||||
|
# Just regular text.
|
||||||
|
allowed_text => {
|
||||||
|
callback("on_text", data, encoding, ts, te);
|
||||||
|
fnext main;
|
||||||
|
};
|
||||||
|
*|;
|
||||||
|
|
||||||
# The main machine aka the entry point of Ragel.
|
# The main machine aka the entry point of Ragel.
|
||||||
main := |*
|
main := |*
|
||||||
doctype_start => start_doctype;
|
doctype_start => start_doctype;
|
||||||
|
@ -269,19 +316,8 @@
|
||||||
comment => start_comment;
|
comment => start_comment;
|
||||||
cdata => start_cdata;
|
cdata => start_cdata;
|
||||||
proc_ins_start => start_proc_ins;
|
proc_ins_start => start_proc_ins;
|
||||||
|
element_start => start_element;
|
||||||
# The start of an element.
|
element_end => close_element;
|
||||||
'<' => start_element;
|
any => start_text;
|
||||||
|
|
||||||
# Regular closing tags.
|
|
||||||
element_end => {
|
|
||||||
callback_simple("on_element_end");
|
|
||||||
};
|
|
||||||
|
|
||||||
# Treat everything else, except for "<", as regular text. The "<" sign
|
|
||||||
# is used for tags so we can't emit text nodes for these characters.
|
|
||||||
any+ -- '<' => {
|
|
||||||
callback("on_text", data, encoding, ts, te);
|
|
||||||
};
|
|
||||||
*|;
|
*|;
|
||||||
}%%
|
}%%
|
||||||
|
|
Loading…
Reference in New Issue