Lex comments in chunks

Similar to what was added for CDATA tags in
8acc7fc743, comments are now also lexed in
chunks.

Related issue: #93
This commit is contained in:
Yorick Peterse 2015-04-14 23:11:22 +02:00
parent 8acc7fc743
commit ea8b4aa92f
7 changed files with 129 additions and 18 deletions

View File

@ -93,7 +93,9 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
ID id_on_cdata_start = rb_intern("on_cdata_start"); ID id_on_cdata_start = rb_intern("on_cdata_start");
ID id_on_cdata_body = rb_intern("on_cdata_body"); ID id_on_cdata_body = rb_intern("on_cdata_body");
ID id_on_cdata_end = rb_intern("on_cdata_end"); ID id_on_cdata_end = rb_intern("on_cdata_end");
ID id_on_comment = rb_intern("on_comment"); ID id_on_comment_start = rb_intern("on_comment_start");
ID id_on_comment_body = rb_intern("on_comment_body");
ID id_on_comment_end = rb_intern("on_comment_end");
ID id_on_doctype_end = rb_intern("on_doctype_end"); ID id_on_doctype_end = rb_intern("on_doctype_end");
ID id_on_doctype_inline = rb_intern("on_doctype_inline"); ID id_on_doctype_inline = rb_intern("on_doctype_inline");
ID id_on_doctype_name = rb_intern("on_doctype_name"); ID id_on_doctype_name = rb_intern("on_doctype_name");

View File

@ -107,7 +107,9 @@ public class Lexer extends RubyObject
String id_on_cdata_start = "on_cdata_start"; String id_on_cdata_start = "on_cdata_start";
String id_on_cdata_body = "on_cdata_body"; String id_on_cdata_body = "on_cdata_body";
String id_on_cdata_end = "on_cdata_end"; String id_on_cdata_end = "on_cdata_end";
String id_on_comment = "on_comment"; String id_on_comment_start = "on_comment_start";
String id_on_comment_body = "on_comment_body";
String id_on_comment_end = "on_comment_end";
String id_on_doctype_end = "on_doctype_end"; String id_on_doctype_end = "on_doctype_end";
String id_on_doctype_inline = "on_doctype_inline"; String id_on_doctype_inline = "on_doctype_inline";
String id_on_doctype_name = "on_doctype_name"; String id_on_doctype_name = "on_doctype_name";

View File

@ -67,12 +67,35 @@
comment_start = '<!--'; comment_start = '<!--';
comment_end = '-->'; comment_end = '-->';
comment = comment_start (any* -- comment_end) comment_end;
# Everything except "-" OR a single "-"
comment_allowed = (^'-'+ | '-') $count_newlines;
action start_comment { action start_comment {
callback(id_on_comment, data, encoding, ts + 4, te - 3); callback_simple(id_on_comment_start);
fnext comment_body;
} }
comment_body := |*
comment_allowed => {
callback(id_on_comment_body, data, encoding, ts, te);
if ( lines > 0 )
{
advance_line(lines);
lines = 0;
}
};
comment_end => {
callback_simple(id_on_comment_end);
fnext main;
};
*|;
# CDATA # CDATA
# #
# http://www.w3.org/TR/html-markup/syntax.html#cdata-sections # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
@ -461,7 +484,7 @@
main := |* main := |*
doctype_start => start_doctype; doctype_start => start_doctype;
xml_decl_start => start_xml_decl; xml_decl_start => start_xml_decl;
comment => start_comment; comment_start => start_comment;
cdata_start => start_cdata; cdata_start => start_cdata;
proc_ins_start => start_proc_ins; proc_ins_start => start_proc_ins;
element_start => start_element; element_start => start_element;

View File

@ -284,13 +284,27 @@ module Oga
add_token(:T_CDATA_BODY, value) add_token(:T_CDATA_BODY, value)
end end
##
# Called on the open comment tag.
#
def on_comment_start
add_token(:T_COMMENT_START)
end
##
# Called on the closing comment tag.
#
def on_comment_end
add_token(:T_COMMENT_END)
end
## ##
# Called on a comment. # Called on a comment.
# #
# @param [String] value # @param [String] value
# #
def on_comment(value) def on_comment_body(value)
add_token(:T_COMMENT, value) add_token(:T_COMMENT_BODY, value)
end end
## ##

View File

@ -24,7 +24,8 @@
%terminals T_TEXT T_STRING_SQUOTE T_STRING_DQUOTE T_STRING_BODY; %terminals T_TEXT T_STRING_SQUOTE T_STRING_DQUOTE T_STRING_BODY;
%terminals T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME; %terminals T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME;
%terminals T_DOCTYPE_INLINE T_COMMENT; %terminals T_DOCTYPE_INLINE;
%terminals T_COMMENT_START T_COMMENT_BODY T_COMMENT_END;
%terminals T_CDATA_START T_CDATA_BODY T_CDATA_END; %terminals T_CDATA_START T_CDATA_BODY T_CDATA_END;
%terminals T_ELEM_START T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR T_ATTR_NS; %terminals T_ELEM_START T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR T_ATTR_NS;
%terminals T_XML_DECL_START T_XML_DECL_END; %terminals T_XML_DECL_START T_XML_DECL_END;
@ -105,7 +106,12 @@ cdata_body
# Comments # Comments
comment comment
= T_COMMENT { on_comment(val[0]) } = T_COMMENT_START comment_body T_COMMENT_END { on_comment(val[1]) }
;
comment_body
= T_COMMENT_BODY comment_body { val[0] + val[1] }
| _ { '' }
; ;
# Processing Instructions # Processing Instructions

View File

@ -3,20 +3,48 @@ require 'spec_helper'
describe Oga::XML::Lexer do describe Oga::XML::Lexer do
describe 'comments' do describe 'comments' do
it 'lexes a comment' do it 'lexes a comment' do
lex('<!-- foo -->').should == [[:T_COMMENT, ' foo ', 1]] lex('<!-- foo -->').should == [
[:T_COMMENT_START, nil, 1],
[:T_COMMENT_BODY, ' foo ', 1],
[:T_COMMENT_END, nil, 1]
]
end
it 'lexes a comment containing -' do
lex('<!-- - -->').should == [
[:T_COMMENT_START, nil, 1],
[:T_COMMENT_BODY, ' ', 1],
[:T_COMMENT_BODY, '-', 1],
[:T_COMMENT_BODY, ' ', 1],
[:T_COMMENT_END, nil, 1],
]
end end
it 'lexes a comment containing --' do it 'lexes a comment containing --' do
lex('<!-- -- -->').should == [[:T_COMMENT, ' -- ', 1]] lex('<!-- -- -->').should == [
[:T_COMMENT_START, nil, 1],
[:T_COMMENT_BODY, ' ', 1],
[:T_COMMENT_BODY, '-', 1],
[:T_COMMENT_BODY, '-', 1],
[:T_COMMENT_BODY, ' ', 1],
[:T_COMMENT_END, nil, 1]
]
end end
it 'lexes a comment containing ->' do it 'lexes a comment containing ->' do
lex('<!-- -> -->').should == [[:T_COMMENT, ' -> ', 1]] lex('<!-- -> -->').should == [
[:T_COMMENT_START, nil, 1],
[:T_COMMENT_BODY, ' ', 1],
[:T_COMMENT_BODY, '-', 1],
[:T_COMMENT_BODY, '> ', 1],
[:T_COMMENT_END, nil, 1]
]
end end
it 'lexes a comment followed by text' do it 'lexes a comment followed by text' do
lex('<!---->foo').should == [ lex('<!---->foo').should == [
[:T_COMMENT, '', 1], [:T_COMMENT_START, nil, 1],
[:T_COMMENT_END, nil, 1],
[:T_TEXT, 'foo', 1] [:T_TEXT, 'foo', 1]
] ]
end end
@ -24,7 +52,8 @@ describe Oga::XML::Lexer do
it 'lexes text followed by a comment' do it 'lexes text followed by a comment' do
lex('foo<!---->').should == [ lex('foo<!---->').should == [
[:T_TEXT, 'foo', 1], [:T_TEXT, 'foo', 1],
[:T_COMMENT, '', 1] [:T_COMMENT_START, nil, 1],
[:T_COMMENT_END, nil, 1]
] ]
end end
@ -33,7 +62,8 @@ describe Oga::XML::Lexer do
[:T_ELEM_START, nil, 1], [:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'p', 1], [:T_ELEM_NAME, 'p', 1],
[:T_ELEM_END, nil, 1], [:T_ELEM_END, nil, 1],
[:T_COMMENT, '', 1] [:T_COMMENT_START, nil, 1],
[:T_COMMENT_END, nil, 1]
] ]
end end
@ -41,13 +71,45 @@ describe Oga::XML::Lexer do
lex('<a><!--foo--><b><!--bar--></b></a>').should == [ lex('<a><!--foo--><b><!--bar--></b></a>').should == [
[:T_ELEM_START, nil, 1], [:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'a', 1], [:T_ELEM_NAME, 'a', 1],
[:T_COMMENT, 'foo', 1], [:T_COMMENT_START, nil, 1],
[:T_COMMENT_BODY, 'foo', 1],
[:T_COMMENT_END, nil, 1],
[:T_ELEM_START, nil, 1], [:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'b', 1], [:T_ELEM_NAME, 'b', 1],
[:T_COMMENT, 'bar', 1], [:T_COMMENT_START, nil, 1],
[:T_COMMENT_BODY, 'bar', 1],
[:T_COMMENT_END, nil, 1],
[:T_ELEM_END, nil, 1], [:T_ELEM_END, nil, 1],
[:T_ELEM_END, nil, 1] [:T_ELEM_END, nil, 1]
] ]
end end
describe 'using an IO as input' do
it 'lexes a comment containing a newline after the open tag' do
lex_stringio("<!--\nfoo-->").should == [
[:T_COMMENT_START, nil, 1],
[:T_COMMENT_BODY, "\n", 1],
[:T_COMMENT_BODY, "foo", 2],
[:T_COMMENT_END, nil, 2]
]
end
it 'lexes a comment containing a newline before the closing tag' do
lex_stringio("<!--foo\n-->").should == [
[:T_COMMENT_START, nil, 1],
[:T_COMMENT_BODY, "foo\n", 1],
[:T_COMMENT_END, nil, 2]
]
end
it 'lexes a comment with the body surrounded by newlines' do
lex_stringio("<!--\nfoo\n-->").should == [
[:T_COMMENT_START, nil, 1],
[:T_COMMENT_BODY, "\n", 1],
[:T_COMMENT_BODY, "foo\n", 2],
[:T_COMMENT_END, nil, 3]
]
end
end
end end
end end

View File

@ -20,7 +20,9 @@ describe Oga::XML::Lexer do
[:T_ELEM_START, nil, 1], [:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'script', 1], [:T_ELEM_NAME, 'script', 1],
[:T_TEXT, @javascript, 1], [:T_TEXT, @javascript, 1],
[:T_COMMENT, 'foo', 1], [:T_COMMENT_START, nil, 1],
[:T_COMMENT_BODY, 'foo', 1],
[:T_COMMENT_END, nil, 1],
[:T_ELEM_END, nil, 1] [:T_ELEM_END, nil, 1]
] ]
end end