From 81edce2eb857306082d3a93695a70ce70a9a8b28 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Fri, 15 Aug 2014 20:36:40 +0200 Subject: [PATCH] Fixed lexing of XML comments. The previous setup would consume too much. For example the following HTML: <a><!--foo--><b><!--bar--></b></a> would result in the following T_COMMENT token: "foo--><b><!--bar" [remainder of the commit message and the head of the lexer grammar diff were lost in extraction] -->'; + + comment_start = '<!--'; + comment_end = '-->'; + + action start_comment { + mark = ts + 4; + + fnext comment_body; + } + + comment_body := |* + comment_end => { + callback("on_comment", data, encoding, mark, te - 3); + + mark = 0; + + fnext main; + }; + + any; + *|; # CDATA # @@ -215,10 +235,7 @@ main := |* doctype_start => start_doctype; xml_decl_start => start_xml_decl; - - comment => { - callback("on_comment", data, encoding, ts + 4, te - 3); - }; + comment_start => start_comment; cdata => { callback("on_cdata", data, encoding, ts + 9, te - 3); diff --git a/spec/oga/xml/lexer/comments_spec.rb b/spec/oga/xml/lexer/comments_spec.rb index 0c3176e..34cb54b 100644 --- a/spec/oga/xml/lexer/comments_spec.rb +++ b/spec/oga/xml/lexer/comments_spec.rb @@ -36,5 +36,18 @@ describe Oga::XML::Lexer do [:T_COMMENT, '', 1] ] end + + example 'lex two comments following each other' do + lex('<a><!--foo--><b><!--bar--></b></a>').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'a', 1], + [:T_COMMENT, 'foo', 1], + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'b', 1], + [:T_COMMENT, 'bar', 1], + [:T_ELEM_END, nil, 1], + [:T_ELEM_END, nil, 1] + ] + end end end