From 4e8cca258ce1ff30506c89e857eea4698bcbc3f0 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Fri, 15 Aug 2014 20:47:58 +0200 Subject: [PATCH] Fixed lexing of XML CDATA tags. --- ext/ragel/base_lexer.rl | 27 ++++++++++++++++++++++----- spec/oga/xml/lexer/cdata_spec.rb | 13 +++++++++++++ 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index c095a00..3f59b46 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -76,7 +76,27 @@ # In HTML CDATA tags have no meaning/are not supported. Oga does # support them but treats their contents as plain text. # - cdata = '<![CDATA[' any* ']]>'; + + cdata_start = '<![CDATA['; + cdata_end = ']]>'; + + action start_cdata { + mark = ts + 9; + + fnext cdata_body; + } + + cdata_body := |* + cdata_end => { + callback("on_cdata", data, encoding, mark, te - 3); + + mark = 0; + + fnext main; + }; + + any; + *|; # Strings # @@ -236,10 +256,7 @@ doctype_start => start_doctype; xml_decl_start => start_xml_decl; comment_start => start_comment; - - cdata => { - callback("on_cdata", data, encoding, ts + 9, te - 3); - }; + cdata_start => start_cdata; # The start of an element. '<' => start_element; diff --git a/spec/oga/xml/lexer/cdata_spec.rb b/spec/oga/xml/lexer/cdata_spec.rb index 0e0887b..f926747 100644 --- a/spec/oga/xml/lexer/cdata_spec.rb +++ b/spec/oga/xml/lexer/cdata_spec.rb @@ -13,5 +13,18 @@ describe Oga::XML::Lexer do example 'lex double brackets inside a CDATA tag' do lex('<![CDATA[]]]]>').should == [[:T_CDATA, ']]', 1]] end + + example 'lex two CDATA tags following each other' do + lex('<a><![CDATA[foo]]><b><![CDATA[bar]]></b></a>').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'a', 1], + [:T_CDATA, 'foo', 1], + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'b', 1], + [:T_CDATA, 'bar', 1], + [:T_ELEM_END, nil, 1], + [:T_ELEM_END, nil, 1] + ] + end end end