From 49ddebf358e0e8af2f28de953350eefe2ceb7a68 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Wed, 3 Sep 2014 00:51:13 +0200 Subject: [PATCH] Tighten lexing of T_TEXT nodes. Thanks to some heavy rubberducking with @whitequark the lexer is now a little bit better at lexing T_TEXT nodes. For example, previously the following could not be lexed properly: "foo < bar" There might still be some tweaking to do but we're getting there. --- ext/ragel/base_lexer.rl | 68 +++++++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index 336dd4f..21f75ad 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -37,7 +37,8 @@ newline = '\n' | '\r\n'; whitespace = [ \t]; - identifier = [a-zA-Z0-9\-_]+; + ident_char = [a-zA-Z0-9\-_]; + identifier = ident_char+; # Comments # @@ -209,13 +210,19 @@ # body of an element is lexed using the `main` machine. # - element_end = '</' identifier (':' identifier)* '>'; + element_start = '<' ident_char; + element_end = '</' identifier (':' identifier)* '>'; action start_element { callback_simple("on_element_start"); + fhold; fnext element_name; } + action close_element { + callback_simple("on_element_end"); + } + # Machine used for lexing the name/namespace of an element. element_name := |* identifier ':' => { @@ -262,6 +269,46 @@ }; *|; + # Text + # + # http://www.w3.org/TR/xml/#syntax + # http://www.w3.org/TR/html-markup/syntax.html#text-syntax + # + # Text content is everything leading up to certain special tags such as "</" + # and "<?". + + action start_text { + fhold; + fnext text; + } + + # These characters terminate a T_TEXT sequence and instruct Ragel to jump + # back to the main machine. + # + # Note that this only works if each sequence is exactly 2 characters + # long. Because of this "<!" is used instead of "<!--". + + terminate_text = '</' | '<!' | '<?' | element_start; + allowed_text = any* -- terminate_text; + + text := |* + # Text followed by a special tag, such as "foo<!--" + allowed_text %{ mark = p; } terminate_text => { + callback("on_text", data, encoding, ts, mark); + + p = mark - 1; + mark = 0; + + fnext main; + }; + + # Just regular text. + allowed_text => { + callback("on_text", data, encoding, ts, te); + fnext main; + }; + *|; + # The main machine aka the entry point of Ragel. main := |* doctype_start => start_doctype; @@ -269,19 +316,8 @@ comment => start_comment; cdata => start_cdata; proc_ins_start => start_proc_ins; - - # The start of an element. 
- '<' => start_element; - - # Regular closing tags. - element_end => { - callback_simple("on_element_end"); - }; - - # Treat everything else, except for "<", as regular text. The "<" sign - # is used for tags so we can't emit text nodes for these characters. - any+ -- '<' => { - callback("on_text", data, encoding, ts, te); - }; + element_start => start_element; + element_end => close_element; + any => start_text; *|; }%%