Tighten lexing of T_TEXT nodes.

Thanks to some heavy rubberducking with @whitequark the lexer is now a little
bit better at lexing T_TEXT nodes. For example, previously the following could
not be lexed properly:

    "foo < bar"

There might still be some tweaking to do, but we're getting there.
This commit is contained in:
Yorick Peterse 2014-09-03 00:51:13 +02:00
parent 145315c26a
commit 49ddebf358
1 changed file with 52 additions and 16 deletions

View File

@ -37,7 +37,8 @@
newline = '\n' | '\r\n'; newline = '\n' | '\r\n';
whitespace = [ \t]; whitespace = [ \t];
identifier = [a-zA-Z0-9\-_]+; ident_char = [a-zA-Z0-9\-_];
identifier = ident_char+;
# Comments # Comments
# #
@ -209,13 +210,19 @@
# body of an element is lexed using the `main` machine. # body of an element is lexed using the `main` machine.
# #
element_start = '<' ident_char;
element_end = '</' identifier (':' identifier)* '>'; element_end = '</' identifier (':' identifier)* '>';
action start_element { action start_element {
callback_simple("on_element_start"); callback_simple("on_element_start");
fhold;
fnext element_name; fnext element_name;
} }
# Emits the "on_element_end" callback. The `main` machine runs this action
# whenever the `element_end` pattern (a closing tag such as "</foo>") matches.
action close_element {
callback_simple("on_element_end");
}
# Machine used for lexing the name/namespace of an element. # Machine used for lexing the name/namespace of an element.
element_name := |* element_name := |*
identifier ':' => { identifier ':' => {
@ -262,6 +269,46 @@
}; };
*|; *|;
# Text
#
# http://www.w3.org/TR/xml/#syntax
# http://www.w3.org/TR/html-markup/syntax.html#text-syntax
#
# Text content is everything leading up to certain special tags such as "</"
# and "<?".
# Hands lexing over to the `text` machine. The `main` machine runs this action
# for any character that none of its other rules matched.
#
# `fhold` steps the input position back by one so that the character that
# triggered this action is read again after the switch; `fnext text` makes
# `text` the machine that resumes scanning.
action start_text {
fhold;
fnext text;
}
# These character sequences terminate a T_TEXT sequence and instruct Ragel to
# jump back to the main machine.
#
# Note that this only works if each sequence is exactly 2 characters
# long. Because of this, "<!" is used instead of "<!--". `element_start` is a
# "<" followed by a single identifier character, so it is 2 characters long too.
terminate_text = '</' | '<!' | '<?' | element_start;
# `--` is Ragel's strong difference operator: `allowed_text` matches any input
# that does not contain a `terminate_text` sequence anywhere inside it.
allowed_text = any* -- terminate_text;
# Scanner used for lexing text nodes. Both rules emit a single "on_text"
# callback and then hand control back to the `main` machine via `fnext main`.
text := |*
# Text followed by a special tag, such as "foo<!--"
#
# While `allowed_text` is being matched the embedded action records the current
# input position in `mark`. Once the terminating sequence has been seen, the
# text from `ts` up to `mark` is emitted, `p` is rewound to `mark - 1` and
# `mark` is cleared.
#
# NOTE(review): the rewind presumably exists so that the terminating sequence
# itself is not consumed here and is lexed again by `main` - confirm against
# how the scanner advances `p` after an action.
allowed_text @{ mark = p; } terminate_text => {
callback("on_text", data, encoding, ts, mark);
p = mark - 1;
mark = 0;
fnext main;
};
# Just regular text.
#
# No terminating sequence followed the text, so everything between `ts` and
# `te` (the whole match) is emitted as the text node.
allowed_text => {
callback("on_text", data, encoding, ts, te);
fnext main;
};
*|;
# The main machine aka the entry point of Ragel. # The main machine aka the entry point of Ragel.
main := |* main := |*
doctype_start => start_doctype; doctype_start => start_doctype;
@ -269,19 +316,8 @@
comment => start_comment; comment => start_comment;
cdata => start_cdata; cdata => start_cdata;
proc_ins_start => start_proc_ins; proc_ins_start => start_proc_ins;
element_start => start_element;
# The start of an element. element_end => close_element;
'<' => start_element; any => start_text;
# Regular closing tags.
element_end => {
callback_simple("on_element_end");
};
# Treat everything else, except for "<", as regular text. The "<" sign
# is used for tags so we can't emit text nodes for these characters.
any+ -- '<' => {
callback("on_text", data, encoding, ts, te);
};
*|; *|;
}%% }%%