Lex comments in chunks
Similar to the chunked lexing added for CDATA tags in
8acc7fc743,
comments are now also lexed in
chunks.
Related issue: #93
This commit is contained in:
parent
8acc7fc743
commit
ea8b4aa92f
|
@ -93,7 +93,9 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
|
|||
ID id_on_cdata_start = rb_intern("on_cdata_start");
|
||||
ID id_on_cdata_body = rb_intern("on_cdata_body");
|
||||
ID id_on_cdata_end = rb_intern("on_cdata_end");
|
||||
ID id_on_comment = rb_intern("on_comment");
|
||||
ID id_on_comment_start = rb_intern("on_comment_start");
|
||||
ID id_on_comment_body = rb_intern("on_comment_body");
|
||||
ID id_on_comment_end = rb_intern("on_comment_end");
|
||||
ID id_on_doctype_end = rb_intern("on_doctype_end");
|
||||
ID id_on_doctype_inline = rb_intern("on_doctype_inline");
|
||||
ID id_on_doctype_name = rb_intern("on_doctype_name");
|
||||
|
|
|
@ -107,7 +107,9 @@ public class Lexer extends RubyObject
|
|||
String id_on_cdata_start = "on_cdata_start";
|
||||
String id_on_cdata_body = "on_cdata_body";
|
||||
String id_on_cdata_end = "on_cdata_end";
|
||||
String id_on_comment = "on_comment";
|
||||
String id_on_comment_start = "on_comment_start";
|
||||
String id_on_comment_body = "on_comment_body";
|
||||
String id_on_comment_end = "on_comment_end";
|
||||
String id_on_doctype_end = "on_doctype_end";
|
||||
String id_on_doctype_inline = "on_doctype_inline";
|
||||
String id_on_doctype_name = "on_doctype_name";
|
||||
|
|
|
@ -67,12 +67,35 @@
|
|||
|
||||
comment_start = '<!--';
|
||||
comment_end = '-->';
|
||||
comment = comment_start (any* -- comment_end) comment_end;
|
||||
|
||||
# A run of one or more characters other than "-", OR a single "-"
|
||||
comment_allowed = (^'-'+ | '-') $count_newlines;
|
||||
|
||||
action start_comment {
|
||||
callback(id_on_comment, data, encoding, ts + 4, te - 3);
|
||||
callback_simple(id_on_comment_start);
|
||||
|
||||
fnext comment_body;
|
||||
}
|
||||
|
||||
comment_body := |*
|
||||
comment_allowed => {
|
||||
callback(id_on_comment_body, data, encoding, ts, te);
|
||||
|
||||
if ( lines > 0 )
|
||||
{
|
||||
advance_line(lines);
|
||||
|
||||
lines = 0;
|
||||
}
|
||||
};
|
||||
|
||||
comment_end => {
|
||||
callback_simple(id_on_comment_end);
|
||||
|
||||
fnext main;
|
||||
};
|
||||
*|;
|
||||
|
||||
# CDATA
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
|
||||
|
@ -461,7 +484,7 @@
|
|||
main := |*
|
||||
doctype_start => start_doctype;
|
||||
xml_decl_start => start_xml_decl;
|
||||
comment => start_comment;
|
||||
comment_start => start_comment;
|
||||
cdata_start => start_cdata;
|
||||
proc_ins_start => start_proc_ins;
|
||||
element_start => start_element;
|
||||
|
|
|
@ -284,13 +284,27 @@ module Oga
|
|||
add_token(:T_CDATA_BODY, value)
|
||||
end
|
||||
|
||||
##
|
||||
# Called on the opening comment tag.
|
||||
#
|
||||
def on_comment_start
|
||||
add_token(:T_COMMENT_START)
|
||||
end
|
||||
|
||||
##
|
||||
# Called on the closing comment tag.
|
||||
#
|
||||
def on_comment_end
|
||||
add_token(:T_COMMENT_END)
|
||||
end
|
||||
|
||||
##
|
||||
# Called on a comment.
|
||||
#
|
||||
# @param [String] value
|
||||
#
|
||||
def on_comment(value)
|
||||
add_token(:T_COMMENT, value)
|
||||
def on_comment_body(value)
|
||||
add_token(:T_COMMENT_BODY, value)
|
||||
end
|
||||
|
||||
##
|
||||
|
|
|
@ -24,7 +24,8 @@
|
|||
|
||||
%terminals T_TEXT T_STRING_SQUOTE T_STRING_DQUOTE T_STRING_BODY;
|
||||
%terminals T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME;
|
||||
%terminals T_DOCTYPE_INLINE T_COMMENT;
|
||||
%terminals T_DOCTYPE_INLINE;
|
||||
%terminals T_COMMENT_START T_COMMENT_BODY T_COMMENT_END;
|
||||
%terminals T_CDATA_START T_CDATA_BODY T_CDATA_END;
|
||||
%terminals T_ELEM_START T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR T_ATTR_NS;
|
||||
%terminals T_XML_DECL_START T_XML_DECL_END;
|
||||
|
@ -105,7 +106,12 @@ cdata_body
|
|||
# Comments
|
||||
|
||||
comment
|
||||
= T_COMMENT { on_comment(val[0]) }
|
||||
= T_COMMENT_START comment_body T_COMMENT_END { on_comment(val[1]) }
|
||||
;
|
||||
|
||||
comment_body
|
||||
= T_COMMENT_BODY comment_body { val[0] + val[1] }
|
||||
| _ { '' }
|
||||
;
|
||||
|
||||
# Processing Instructions
|
||||
|
|
|
@ -3,20 +3,48 @@ require 'spec_helper'
|
|||
describe Oga::XML::Lexer do
|
||||
describe 'comments' do
|
||||
it 'lexes a comment' do
|
||||
lex('<!-- foo -->').should == [[:T_COMMENT, ' foo ', 1]]
|
||||
lex('<!-- foo -->').should == [
|
||||
[:T_COMMENT_START, nil, 1],
|
||||
[:T_COMMENT_BODY, ' foo ', 1],
|
||||
[:T_COMMENT_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
it 'lexes a comment containing -' do
|
||||
lex('<!-- - -->').should == [
|
||||
[:T_COMMENT_START, nil, 1],
|
||||
[:T_COMMENT_BODY, ' ', 1],
|
||||
[:T_COMMENT_BODY, '-', 1],
|
||||
[:T_COMMENT_BODY, ' ', 1],
|
||||
[:T_COMMENT_END, nil, 1],
|
||||
]
|
||||
end
|
||||
|
||||
it 'lexes a comment containing --' do
|
||||
lex('<!-- -- -->').should == [[:T_COMMENT, ' -- ', 1]]
|
||||
lex('<!-- -- -->').should == [
|
||||
[:T_COMMENT_START, nil, 1],
|
||||
[:T_COMMENT_BODY, ' ', 1],
|
||||
[:T_COMMENT_BODY, '-', 1],
|
||||
[:T_COMMENT_BODY, '-', 1],
|
||||
[:T_COMMENT_BODY, ' ', 1],
|
||||
[:T_COMMENT_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
it 'lexes a comment containing ->' do
|
||||
lex('<!-- -> -->').should == [[:T_COMMENT, ' -> ', 1]]
|
||||
lex('<!-- -> -->').should == [
|
||||
[:T_COMMENT_START, nil, 1],
|
||||
[:T_COMMENT_BODY, ' ', 1],
|
||||
[:T_COMMENT_BODY, '-', 1],
|
||||
[:T_COMMENT_BODY, '> ', 1],
|
||||
[:T_COMMENT_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
it 'lexes a comment followed by text' do
|
||||
lex('<!---->foo').should == [
|
||||
[:T_COMMENT, '', 1],
|
||||
[:T_COMMENT_START, nil, 1],
|
||||
[:T_COMMENT_END, nil, 1],
|
||||
[:T_TEXT, 'foo', 1]
|
||||
]
|
||||
end
|
||||
|
@ -24,7 +52,8 @@ describe Oga::XML::Lexer do
|
|||
it 'lexes text followed by a comment' do
|
||||
lex('foo<!---->').should == [
|
||||
[:T_TEXT, 'foo', 1],
|
||||
[:T_COMMENT, '', 1]
|
||||
[:T_COMMENT_START, nil, 1],
|
||||
[:T_COMMENT_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
|
@ -33,7 +62,8 @@ describe Oga::XML::Lexer do
|
|||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'p', 1],
|
||||
[:T_ELEM_END, nil, 1],
|
||||
[:T_COMMENT, '', 1]
|
||||
[:T_COMMENT_START, nil, 1],
|
||||
[:T_COMMENT_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
|
@ -41,13 +71,45 @@ describe Oga::XML::Lexer do
|
|||
lex('<a><!--foo--><b><!--bar--></b></a>').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'a', 1],
|
||||
[:T_COMMENT, 'foo', 1],
|
||||
[:T_COMMENT_START, nil, 1],
|
||||
[:T_COMMENT_BODY, 'foo', 1],
|
||||
[:T_COMMENT_END, nil, 1],
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'b', 1],
|
||||
[:T_COMMENT, 'bar', 1],
|
||||
[:T_COMMENT_START, nil, 1],
|
||||
[:T_COMMENT_BODY, 'bar', 1],
|
||||
[:T_COMMENT_END, nil, 1],
|
||||
[:T_ELEM_END, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
describe 'using an IO as input' do
|
||||
it 'lexes a comment containing a newline after the open tag' do
|
||||
lex_stringio("<!--\nfoo-->").should == [
|
||||
[:T_COMMENT_START, nil, 1],
|
||||
[:T_COMMENT_BODY, "\n", 1],
|
||||
[:T_COMMENT_BODY, "foo", 2],
|
||||
[:T_COMMENT_END, nil, 2]
|
||||
]
|
||||
end
|
||||
|
||||
it 'lexes a comment containing a newline before the closing tag' do
|
||||
lex_stringio("<!--foo\n-->").should == [
|
||||
[:T_COMMENT_START, nil, 1],
|
||||
[:T_COMMENT_BODY, "foo\n", 1],
|
||||
[:T_COMMENT_END, nil, 2]
|
||||
]
|
||||
end
|
||||
|
||||
it 'lexes a comment with the body surrounded by newlines' do
|
||||
lex_stringio("<!--\nfoo\n-->").should == [
|
||||
[:T_COMMENT_START, nil, 1],
|
||||
[:T_COMMENT_BODY, "\n", 1],
|
||||
[:T_COMMENT_BODY, "foo\n", 2],
|
||||
[:T_COMMENT_END, nil, 3]
|
||||
]
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -20,7 +20,9 @@ describe Oga::XML::Lexer do
|
|||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'script', 1],
|
||||
[:T_TEXT, @javascript, 1],
|
||||
[:T_COMMENT, 'foo', 1],
|
||||
[:T_COMMENT_START, nil, 1],
|
||||
[:T_COMMENT_BODY, 'foo', 1],
|
||||
[:T_COMMENT_END, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue