Fixed lexing of XML comments.
The previous setup would consume too much input. For example, the following HTML: <a><!--foo--><b><!--bar--></b></a> would result in the following T_COMMENT token: "foo--><b><!--bar". The new setup requires marking a start position. I'm not a huge fan of this, but there doesn't appear to be a way around it.
This commit is contained in:
parent
4d7f224892
commit
81edce2eb8
|
@ -77,11 +77,12 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
|
||||||
|
|
||||||
char *data_str_val = StringValuePtr(data_block);
|
char *data_str_val = StringValuePtr(data_block);
|
||||||
|
|
||||||
const char *p = data_str_val;
|
const char *p = data_str_val;
|
||||||
const char *pe = data_str_val + strlen(data_str_val);
|
const char *pe = data_str_val + strlen(data_str_val);
|
||||||
const char *eof = pe;
|
const char *eof = pe;
|
||||||
const char *ts = 0;
|
const char *ts = 0;
|
||||||
const char *te = 0;
|
const char *te = 0;
|
||||||
|
const char *mark = 0;
|
||||||
|
|
||||||
int act = NUM2INT(oga_ivar_get(self, "@act"));
|
int act = NUM2INT(oga_ivar_get(self, "@act"));
|
||||||
int cs = NUM2INT(oga_ivar_get(self, "@cs"));
|
int cs = NUM2INT(oga_ivar_get(self, "@cs"));
|
||||||
|
|
|
@ -90,11 +90,12 @@ public class Lexer extends RubyObject
|
||||||
|
|
||||||
byte[] data = rb_str.getBytes();
|
byte[] data = rb_str.getBytes();
|
||||||
|
|
||||||
int ts = 0;
|
int ts = 0;
|
||||||
int te = 0;
|
int te = 0;
|
||||||
int p = 0;
|
int p = 0;
|
||||||
int pe = data.length;
|
int mark = 0;
|
||||||
int eof = data.length;
|
int pe = data.length;
|
||||||
|
int eof = data.length;
|
||||||
|
|
||||||
%% write exec;
|
%% write exec;
|
||||||
|
|
||||||
|
|
|
@ -47,7 +47,27 @@
|
||||||
# such as `--` and `->`. Putting extra checks in for these sequences would
|
# such as `--` and `->`. Putting extra checks in for these sequences would
|
||||||
# actually make the rules/actions more complex.
|
# actually make the rules/actions more complex.
|
||||||
#
|
#
|
||||||
comment = '<!--' any* '-->';
|
|
||||||
|
comment_start = '<!--';
|
||||||
|
comment_end = '-->';
|
||||||
|
|
||||||
|
action start_comment {
|
||||||
|
mark = ts + 4;
|
||||||
|
|
||||||
|
fnext comment_body;
|
||||||
|
}
|
||||||
|
|
||||||
|
comment_body := |*
|
||||||
|
comment_end => {
|
||||||
|
callback("on_comment", data, encoding, mark, te - 3);
|
||||||
|
|
||||||
|
mark = 0;
|
||||||
|
|
||||||
|
fnext main;
|
||||||
|
};
|
||||||
|
|
||||||
|
any;
|
||||||
|
*|;
|
||||||
|
|
||||||
# CDATA
|
# CDATA
|
||||||
#
|
#
|
||||||
|
@ -215,10 +235,7 @@
|
||||||
main := |*
|
main := |*
|
||||||
doctype_start => start_doctype;
|
doctype_start => start_doctype;
|
||||||
xml_decl_start => start_xml_decl;
|
xml_decl_start => start_xml_decl;
|
||||||
|
comment_start => start_comment;
|
||||||
comment => {
|
|
||||||
callback("on_comment", data, encoding, ts + 4, te - 3);
|
|
||||||
};
|
|
||||||
|
|
||||||
cdata => {
|
cdata => {
|
||||||
callback("on_cdata", data, encoding, ts + 9, te - 3);
|
callback("on_cdata", data, encoding, ts + 9, te - 3);
|
||||||
|
|
|
@ -36,5 +36,18 @@ describe Oga::XML::Lexer do
|
||||||
[:T_COMMENT, '', 1]
|
[:T_COMMENT, '', 1]
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
example 'lex two comments following each other' do
|
||||||
|
lex('<a><!--foo--><b><!--bar--></b></a>').should == [
|
||||||
|
[:T_ELEM_START, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'a', 1],
|
||||||
|
[:T_COMMENT, 'foo', 1],
|
||||||
|
[:T_ELEM_START, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'b', 1],
|
||||||
|
[:T_COMMENT, 'bar', 1],
|
||||||
|
[:T_ELEM_END, nil, 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue