Fixed lexing of XML comments.

The previous setup would consume too much. For example, the following HTML:

    <a><!--foo--><b><!--bar--></b></a>

would result in the following T_COMMENT token:

    "foo--><b><!--bar"

The new setup requires marking the start position of the comment body. I'm not
a huge fan of this, but there doesn't appear to be a way around it.
This commit is contained in:
Yorick Peterse 2014-08-15 20:36:40 +02:00
parent 4d7f224892
commit 81edce2eb8
4 changed files with 47 additions and 15 deletions

View File

@ -77,11 +77,12 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
char *data_str_val = StringValuePtr(data_block);
const char *p = data_str_val;
const char *pe = data_str_val + strlen(data_str_val);
const char *eof = pe;
const char *ts = 0;
const char *te = 0;
const char *p = data_str_val;
const char *pe = data_str_val + strlen(data_str_val);
const char *eof = pe;
const char *ts = 0;
const char *te = 0;
const char *mark = 0;
int act = NUM2INT(oga_ivar_get(self, "@act"));
int cs = NUM2INT(oga_ivar_get(self, "@cs"));

View File

@ -90,11 +90,12 @@ public class Lexer extends RubyObject
byte[] data = rb_str.getBytes();
int ts = 0;
int te = 0;
int p = 0;
int pe = data.length;
int eof = data.length;
int ts = 0;
int te = 0;
int p = 0;
int mark = 0;
int pe = data.length;
int eof = data.length;
%% write exec;

View File

@ -47,7 +47,27 @@
# such as `--` and `->`. Putting extra checks in for these sequences would
# actually make the rules/actions more complex.
#
comment = '<!--' any* '-->';
comment_start = '<!--';
comment_end = '-->';
# Runs when `main` matches `comment_start` ('<!--'). Stores the position just
# past the 4-character opener in `mark`, then switches the lexer to the
# `comment_body` scanner. A separate `mark` is kept because `ts` is presumably
# reset for every token matched inside `comment_body` -- confirm against the
# Ragel scanner semantics.
action start_comment {
mark = ts + 4;
fnext comment_body;
}
# Scanner that is active between '<!--' and '-->'. The `any` rule consumes the
# comment text; once `comment_end` ('-->') matches, `mark` and `te - 3` (the
# position just before the 3-character terminator) are passed to the
# "on_comment" callback, presumably as the start and end of the comment text.
# Afterwards `mark` is reset to 0 and lexing continues in `main`.
comment_body := |*
comment_end => {
callback("on_comment", data, encoding, mark, te - 3);
mark = 0;
fnext main;
};
any;
*|;
# CDATA
#
@ -215,10 +235,7 @@
main := |*
doctype_start => start_doctype;
xml_decl_start => start_xml_decl;
comment => {
callback("on_comment", data, encoding, ts + 4, te - 3);
};
comment_start => start_comment;
cdata => {
callback("on_cdata", data, encoding, ts + 9, te - 3);

View File

@ -36,5 +36,18 @@ describe Oga::XML::Lexer do
[:T_COMMENT, '', 1]
]
end
# Regression check: two comments in the same document must be lexed as two
# separate T_COMMENT tokens ('foo' and 'bar'), each next to the element it
# appears in, rather than as a single token running from the first '<!--' to
# the last '-->'.
example 'lex two comments following each other' do
lex('<a><!--foo--><b><!--bar--></b></a>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'a', 1],
[:T_COMMENT, 'foo', 1],
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'b', 1],
[:T_COMMENT, 'bar', 1],
[:T_ELEM_END, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
end
end