Fixed lexing of XML comments.
The previous setup would consume too much. For example, the following HTML: <a><!--foo--><b><!--bar--></b></a> would result in the following T_COMMENT token: "foo--><b><!--bar". The new setup requires marking a start position. I'm not a huge fan of this, but there doesn't appear to be a way around it.
This commit is contained in:
parent
4d7f224892
commit
81edce2eb8
|
@ -77,11 +77,12 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
|
|||
|
||||
char *data_str_val = StringValuePtr(data_block);
|
||||
|
||||
const char *p = data_str_val;
|
||||
const char *pe = data_str_val + strlen(data_str_val);
|
||||
const char *eof = pe;
|
||||
const char *ts = 0;
|
||||
const char *te = 0;
|
||||
const char *p = data_str_val;
|
||||
const char *pe = data_str_val + strlen(data_str_val);
|
||||
const char *eof = pe;
|
||||
const char *ts = 0;
|
||||
const char *te = 0;
|
||||
const char *mark = 0;
|
||||
|
||||
int act = NUM2INT(oga_ivar_get(self, "@act"));
|
||||
int cs = NUM2INT(oga_ivar_get(self, "@cs"));
|
||||
|
|
|
@ -90,11 +90,12 @@ public class Lexer extends RubyObject
|
|||
|
||||
byte[] data = rb_str.getBytes();
|
||||
|
||||
int ts = 0;
|
||||
int te = 0;
|
||||
int p = 0;
|
||||
int pe = data.length;
|
||||
int eof = data.length;
|
||||
int ts = 0;
|
||||
int te = 0;
|
||||
int p = 0;
|
||||
int mark = 0;
|
||||
int pe = data.length;
|
||||
int eof = data.length;
|
||||
|
||||
%% write exec;
|
||||
|
||||
|
|
|
@ -47,7 +47,27 @@
|
|||
# such as `--` and `->`. Putting extra checks in for these sequences would
|
||||
# actually make the rules/actions more complex.
|
||||
#
|
||||
comment = '<!--' any* '-->';
|
||||
|
||||
comment_start = '<!--';
|
||||
comment_end = '-->';
|
||||
|
||||
action start_comment {
|
||||
mark = ts + 4;
|
||||
|
||||
fnext comment_body;
|
||||
}
|
||||
|
||||
comment_body := |*
|
||||
comment_end => {
|
||||
callback("on_comment", data, encoding, mark, te - 3);
|
||||
|
||||
mark = 0;
|
||||
|
||||
fnext main;
|
||||
};
|
||||
|
||||
any;
|
||||
*|;
|
||||
|
||||
# CDATA
|
||||
#
|
||||
|
@ -215,10 +235,7 @@
|
|||
main := |*
|
||||
doctype_start => start_doctype;
|
||||
xml_decl_start => start_xml_decl;
|
||||
|
||||
comment => {
|
||||
callback("on_comment", data, encoding, ts + 4, te - 3);
|
||||
};
|
||||
comment_start => start_comment;
|
||||
|
||||
cdata => {
|
||||
callback("on_cdata", data, encoding, ts + 9, te - 3);
|
||||
|
|
|
@ -36,5 +36,18 @@ describe Oga::XML::Lexer do
|
|||
[:T_COMMENT, '', 1]
|
||||
]
|
||||
end
|
||||
|
||||
example 'lex two comments following each other' do
|
||||
lex('<a><!--foo--><b><!--bar--></b></a>').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'a', 1],
|
||||
[:T_COMMENT, 'foo', 1],
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'b', 1],
|
||||
[:T_COMMENT, 'bar', 1],
|
||||
[:T_ELEM_END, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue