From ea8b4aa92fe746a9da19e94c3edf68b41495d992 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Tue, 14 Apr 2015 23:11:22 +0200 Subject: [PATCH] Lex comments in chunks Similar to this being added for CDATA tags in 8acc7fc743c9492eed2d9c885c22c1b5bec06d0f comments are now also lexed in chunks. Related issue: #93 --- ext/c/lexer.rl | 4 +- ext/java/org/liboga/xml/Lexer.rl | 4 +- ext/ragel/base_lexer.rl | 29 +++++++- lib/oga/xml/lexer.rb | 18 ++++- lib/oga/xml/parser.rll | 10 ++- spec/oga/xml/lexer/comments_spec.rb | 78 ++++++++++++++++++-- spec/oga/xml/lexer/inline_javascript_spec.rb | 4 +- 7 files changed, 129 insertions(+), 18 deletions(-) diff --git a/ext/c/lexer.rl b/ext/c/lexer.rl index db0037d..22d15bc 100644 --- a/ext/c/lexer.rl +++ b/ext/c/lexer.rl @@ -93,7 +93,9 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block) ID id_on_cdata_start = rb_intern("on_cdata_start"); ID id_on_cdata_body = rb_intern("on_cdata_body"); ID id_on_cdata_end = rb_intern("on_cdata_end"); - ID id_on_comment = rb_intern("on_comment"); + ID id_on_comment_start = rb_intern("on_comment_start"); + ID id_on_comment_body = rb_intern("on_comment_body"); + ID id_on_comment_end = rb_intern("on_comment_end"); ID id_on_doctype_end = rb_intern("on_doctype_end"); ID id_on_doctype_inline = rb_intern("on_doctype_inline"); ID id_on_doctype_name = rb_intern("on_doctype_name"); diff --git a/ext/java/org/liboga/xml/Lexer.rl b/ext/java/org/liboga/xml/Lexer.rl index 738ce85..7ef24e0 100644 --- a/ext/java/org/liboga/xml/Lexer.rl +++ b/ext/java/org/liboga/xml/Lexer.rl @@ -107,7 +107,9 @@ public class Lexer extends RubyObject String id_on_cdata_start = "on_cdata_start"; String id_on_cdata_body = "on_cdata_body"; String id_on_cdata_end = "on_cdata_end"; - String id_on_comment = "on_comment"; + String id_on_comment_start = "on_comment_start"; + String id_on_comment_body = "on_comment_body"; + String id_on_comment_end = "on_comment_end"; String id_on_doctype_end = "on_doctype_end"; String id_on_doctype_inline = "on_doctype_inline"; String id_on_doctype_name = "on_doctype_name"; diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index a0721e7..bcabab4 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -67,12 +67,35 @@ comment_start = ''; - comment = comment_start (any* -- comment_end) comment_end; + + # Everything except "-" OR a single "-" + comment_allowed = (^'-'+ | '-') $count_newlines; action start_comment { - callback(id_on_comment, data, encoding, ts + 4, te - 3); + callback_simple(id_on_comment_start); + + fnext comment_body; } + comment_body := |* + comment_allowed => { + callback(id_on_comment_body, data, encoding, ts, te); + + if ( lines > 0 ) + { + advance_line(lines); + + lines = 0; + } + }; + + comment_end => { + callback_simple(id_on_comment_end); + + fnext main; + }; + *|; + # CDATA # # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections @@ -461,7 +484,7 @@ main := |* doctype_start => start_doctype; xml_decl_start => start_xml_decl; - comment => start_comment; + comment_start => start_comment; cdata_start => start_cdata; proc_ins_start => start_proc_ins; element_start => start_element; diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb index d5f51d2..a0fc1c8 100644 --- a/lib/oga/xml/lexer.rb +++ b/lib/oga/xml/lexer.rb @@ -284,13 +284,27 @@ module Oga add_token(:T_CDATA_BODY, value) end + ## + # Called on the open comment tag. + # + def on_comment_start + add_token(:T_COMMENT_START) + end + + ## + # Called on the closing comment tag. + # + def on_comment_end + add_token(:T_COMMENT_END) + end + ## # Called on a comment. # # @param [String] value # - def on_comment(value) - add_token(:T_COMMENT, value) + def on_comment_body(value) + add_token(:T_COMMENT_BODY, value) end ## diff --git a/lib/oga/xml/parser.rll b/lib/oga/xml/parser.rll index fd33903..b2c7d0a 100644 --- a/lib/oga/xml/parser.rll +++ b/lib/oga/xml/parser.rll @@ -24,7 +24,8 @@ %terminals T_TEXT T_STRING_SQUOTE T_STRING_DQUOTE T_STRING_BODY; %terminals T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME; -%terminals T_DOCTYPE_INLINE T_COMMENT; +%terminals T_DOCTYPE_INLINE; +%terminals T_COMMENT_START T_COMMENT_BODY T_COMMENT_END; %terminals T_CDATA_START T_CDATA_BODY T_CDATA_END; %terminals T_ELEM_START T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR T_ATTR_NS; %terminals T_XML_DECL_START T_XML_DECL_END; @@ -105,7 +106,12 @@ cdata_body # Comments comment - = T_COMMENT { on_comment(val[0]) } + = T_COMMENT_START comment_body T_COMMENT_END { on_comment(val[1]) } + ; + +comment_body + = T_COMMENT_BODY comment_body { val[0] + val[1] } + | _ { '' } ; # Processing Instructions diff --git a/spec/oga/xml/lexer/comments_spec.rb b/spec/oga/xml/lexer/comments_spec.rb index fe29603..772ab5b 100644 --- a/spec/oga/xml/lexer/comments_spec.rb +++ b/spec/oga/xml/lexer/comments_spec.rb @@ -3,20 +3,48 @@ require 'spec_helper' describe Oga::XML::Lexer do describe 'comments' do it 'lexes a comment' do - lex('').should == [[:T_COMMENT, ' foo ', 1]] + lex('').should == [ + [:T_COMMENT_START, nil, 1], + [:T_COMMENT_BODY, ' foo ', 1], + [:T_COMMENT_END, nil, 1] + ] + end + + it 'lexes a comment containing -' do + lex('').should == [ + [:T_COMMENT_START, nil, 1], + [:T_COMMENT_BODY, ' ', 1], + [:T_COMMENT_BODY, '-', 1], + [:T_COMMENT_BODY, ' ', 1], + [:T_COMMENT_END, nil, 1], + ] end it 'lexes a comment containing --' do - lex('').should == [[:T_COMMENT, ' -- ', 1]] + lex('').should == [ + [:T_COMMENT_START, nil, 1], + [:T_COMMENT_BODY, ' ', 1], + [:T_COMMENT_BODY, '-', 1], + [:T_COMMENT_BODY, '-', 1], + [:T_COMMENT_BODY, ' ', 1], + [:T_COMMENT_END, nil, 1] + ] end it 'lexes a comment containing ->' do - lex('').should == [[:T_COMMENT, ' -> ', 1]] + lex('').should == [ + [:T_COMMENT_START, nil, 1], + [:T_COMMENT_BODY, ' ', 1], + [:T_COMMENT_BODY, '-', 1], + [:T_COMMENT_BODY, '> ', 1], + [:T_COMMENT_END, nil, 1] + ] end it 'lexes a comment followed by text' do lex('foo').should == [ - [:T_COMMENT, '', 1], + [:T_COMMENT_START, nil, 1], + [:T_COMMENT_END, nil, 1], [:T_TEXT, 'foo', 1] ] end @@ -24,7 +52,8 @@ describe Oga::XML::Lexer do it 'lexes text followed by a comment' do lex('foo').should == [ [:T_TEXT, 'foo', 1], - [:T_COMMENT, '', 1] + [:T_COMMENT_START, nil, 1], + [:T_COMMENT_END, nil, 1] ] end @@ -33,7 +62,8 @@ describe Oga::XML::Lexer do [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'p', 1], [:T_ELEM_END, nil, 1], - [:T_COMMENT, '', 1] + [:T_COMMENT_START, nil, 1], + [:T_COMMENT_END, nil, 1] ] end @@ -41,13 +71,45 @@ describe Oga::XML::Lexer do lex('').should == [ [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'a', 1], - [:T_COMMENT, 'foo', 1], + [:T_COMMENT_START, nil, 1], + [:T_COMMENT_BODY, 'foo', 1], + [:T_COMMENT_END, nil, 1], [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'b', 1], - [:T_COMMENT, 'bar', 1], + [:T_COMMENT_START, nil, 1], + [:T_COMMENT_BODY, 'bar', 1], + [:T_COMMENT_END, nil, 1], [:T_ELEM_END, nil, 1], [:T_ELEM_END, nil, 1] ] end + + describe 'using an IO as input' do + it 'lexes a comment containing a newline after the open tag' do + lex_stringio("").should == [ + [:T_COMMENT_START, nil, 1], + [:T_COMMENT_BODY, "\n", 1], + [:T_COMMENT_BODY, "foo", 2], + [:T_COMMENT_END, nil, 2] + ] + end + + it 'lexes a comment containing a newline before the closing tag' do + lex_stringio("").should == [ + [:T_COMMENT_START, nil, 1], + [:T_COMMENT_BODY, "foo\n", 1], + [:T_COMMENT_END, nil, 2] + ] + end + + it 'lexes a comment with the body surrounded by newlines' do + lex_stringio("").should == [ + [:T_COMMENT_START, nil, 1], + [:T_COMMENT_BODY, "\n", 1], + [:T_COMMENT_BODY, "foo\n", 2], + [:T_COMMENT_END, nil, 3] + ] + end + end end end diff --git a/spec/oga/xml/lexer/inline_javascript_spec.rb b/spec/oga/xml/lexer/inline_javascript_spec.rb index e1d7295..36cf4d2 100644 --- a/spec/oga/xml/lexer/inline_javascript_spec.rb +++ b/spec/oga/xml/lexer/inline_javascript_spec.rb @@ -20,7 +20,9 @@ describe Oga::XML::Lexer do [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'script', 1], [:T_TEXT, @javascript, 1], - [:T_COMMENT, 'foo', 1], + [:T_COMMENT_START, nil, 1], + [:T_COMMENT_BODY, 'foo', 1], + [:T_COMMENT_END, nil, 1], [:T_ELEM_END, nil, 1] ] end