Lex CDATA tags in chunks

Instead of using a single token (T_CDATA) for a CDATA tag, the lexer now uses
3 tokens:

1. T_CDATA_START
2. T_CDATA_BODY
3. T_CDATA_END

The T_CDATA_BODY token can occur multiple times and is turned into a single
value in the XML parser. This is similar to the way strings are lexed.

By changing the way CDATA tags are lexed, Oga can now lex CDATA tags containing
newlines when using an IO as input. For example, this would previously fail:

    Oga.parse_xml(StringIO.new("<![CDATA[\nfoo]]>"))

Because IO input is read line by line, the input for the lexer would be as
follows:

    "<![CDATA[\n"
    "foo]]>"

Neither chunk contains a complete CDATA tag on its own, so the old rule, which
matched the whole tag as a single token, could not match it.

Related issues: #93
2015-04-14 22:45:55 +02:00 · 2015-04-14 22:45:55 +02:00 · 8acc7fc743
parent 739e3b474c
commit 8acc7fc743
8 changed files with 153 additions and 18 deletions
--- a/ext/c/lexer.rl
+++ b/ext/c/lexer.rl
@ -90,7 +90,9 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
    ID id_advance_line        = rb_intern("advance_line");
    ID id_on_attribute        = rb_intern("on_attribute");
    ID id_on_attribute_ns     = rb_intern("on_attribute_ns");
-    ID id_on_cdata            = rb_intern("on_cdata");
+    ID id_on_cdata_start      = rb_intern("on_cdata_start");
+    ID id_on_cdata_body       = rb_intern("on_cdata_body");
+    ID id_on_cdata_end        = rb_intern("on_cdata_end");
    ID id_on_comment          = rb_intern("on_comment");
    ID id_on_doctype_end      = rb_intern("on_doctype_end");
    ID id_on_doctype_inline   = rb_intern("on_doctype_inline");
--- a/ext/java/org/liboga/xml/Lexer.rl
+++ b/ext/java/org/liboga/xml/Lexer.rl
@ -104,7 +104,9 @@ public class Lexer extends RubyObject
        String id_advance_line        = "advance_line";
        String id_on_attribute        = "on_attribute";
        String id_on_attribute_ns     = "on_attribute_ns";
-        String id_on_cdata            = "on_cdata";
+        String id_on_cdata_start      = "on_cdata_start";
+        String id_on_cdata_body       = "on_cdata_body";
+        String id_on_cdata_end        = "on_cdata_end";
        String id_on_comment          = "on_comment";
        String id_on_doctype_end      = "on_doctype_end";
        String id_on_doctype_inline   = "on_doctype_inline";
--- a/ext/ragel/base_lexer.rl
+++ b/ext/ragel/base_lexer.rl
@ -83,12 +83,35 @@

    cdata_start = '<![CDATA[';
    cdata_end   = ']]>';
-    cdata       = cdata_start (any* -- cdata_end) cdata_end;
+
+    # Everything except "]" OR a single "]"
+    cdata_allowed = (^']'+ | ']') $count_newlines;

    action start_cdata {
-        callback(id_on_cdata, data, encoding, ts + 9, te - 3);
+        callback_simple(id_on_cdata_start);
+
+        fnext cdata_body;
    }

+    cdata_body := |*
+        cdata_allowed => {
+            callback(id_on_cdata_body, data, encoding, ts, te);
+
+            if ( lines > 0 )
+            {
+                advance_line(lines);
+
+                lines = 0;
+            }
+        };
+
+        cdata_end => {
+            callback_simple(id_on_cdata_end);
+
+            fnext main;
+        };
+    *|;
+
    # Processing Instructions
    #
    # http://www.w3.org/TR/xpath/#section-Processing-Instruction-Nodes
@ -439,7 +462,7 @@
        doctype_start  => start_doctype;
        xml_decl_start => start_xml_decl;
        comment        => start_comment;
-        cdata          => start_cdata;
+        cdata_start    => start_cdata;
        proc_ins_start => start_proc_ins;
        element_start  => start_element;
        element_end    => close_element;
--- a/lib/oga/xml/lexer.rb
+++ b/lib/oga/xml/lexer.rb
@ -262,10 +262,26 @@ module Oga
      end

      ##
-      # Called on a CDATA tag.
+      # Called on the open CDATA tag.
      #
-      def on_cdata(value)
-        add_token(:T_CDATA, value)
+      def on_cdata_start
+        add_token(:T_CDATA_START)
+      end
+
+      ##
+      # Called on the closing CDATA tag.
+      #
+      def on_cdata_end
+        add_token(:T_CDATA_END)
+      end
+
+      ##
+      # Called for the body of a CDATA tag.
+      #
+      # @param [String] value
+      #
+      def on_cdata_body(value)
+        add_token(:T_CDATA_BODY, value)
      end

      ##
--- a/lib/oga/xml/parser.rll
+++ b/lib/oga/xml/parser.rll
@ -24,7 +24,8 @@

 %terminals T_TEXT T_STRING_SQUOTE T_STRING_DQUOTE T_STRING_BODY;
 %terminals T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME;
-%terminals T_DOCTYPE_INLINE T_CDATA T_COMMENT;
+%terminals T_DOCTYPE_INLINE T_COMMENT;
+%terminals T_CDATA_START T_CDATA_BODY T_CDATA_END;
 %terminals T_ELEM_START T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR T_ATTR_NS;
 %terminals T_XML_DECL_START T_XML_DECL_END;
 %terminals T_PROC_INS_START T_PROC_INS_NAME T_PROC_INS_END;
@ -93,7 +94,12 @@ doctype_types
 # CDATA tags

 cdata
-  = T_CDATA { on_cdata(val[0]) }
+  = T_CDATA_START cdata_body T_CDATA_END { on_cdata(val[1]) }
+  ;
+
+cdata_body
+  = T_CDATA_BODY cdata_body { val[0] + val[1] }
+  | _                       { '' }
  ;

 # Comments
--- a/spec/oga/xml/lexer/cdata_spec.rb
+++ b/spec/oga/xml/lexer/cdata_spec.rb
@ -1,30 +1,107 @@
 require 'spec_helper'

 describe Oga::XML::Lexer do
-  describe 'cdata tags' do
-    it 'lexes a cdata tag' do
-      lex('<![CDATA[foo]]>').should == [[:T_CDATA, 'foo', 1]]
+  describe 'CDATA tags' do
+    it 'lexes a CDATA tag' do
+      lex('<![CDATA[foo]]>').should == [
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, 'foo', 1],
+        [:T_CDATA_END, nil, 1]
+      ]
    end

    it 'lexes tags inside CDATA tags as regular text' do
-      lex('<![CDATA[<p>Foo</p>]]>').should == [[:T_CDATA, '<p>Foo</p>', 1]]
+      lex('<![CDATA[<p>Foo</p>]]>').should == [
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, '<p>Foo</p>', 1],
+        [:T_CDATA_END, nil, 1]
+      ]
+    end
+
+    it 'lexes a single bracket inside a CDATA tag' do
+      lex('<![CDATA[]]]>').should == [
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, ']', 1],
+        [:T_CDATA_END, nil, 1]
+      ]
    end

    it 'lexes double brackets inside a CDATA tag' do
-      lex('<![CDATA[]]]]>').should == [[:T_CDATA, ']]', 1]]
+      lex('<![CDATA[]]]]>').should == [
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, ']', 1],
+        [:T_CDATA_BODY, ']', 1],
+        [:T_CDATA_END, nil, 1]
+      ]
    end

    it 'lexes two CDATA tags following each other' do
      lex('<a><![CDATA[foo]]><b><![CDATA[bar]]></b></a>').should == [
        [:T_ELEM_START, nil, 1],
        [:T_ELEM_NAME, 'a', 1],
-        [:T_CDATA, 'foo', 1],
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, 'foo', 1],
+        [:T_CDATA_END, nil, 1],
        [:T_ELEM_START, nil, 1],
        [:T_ELEM_NAME, 'b', 1],
-        [:T_CDATA, 'bar', 1],
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, 'bar', 1],
+        [:T_CDATA_END, nil, 1],
        [:T_ELEM_END, nil, 1],
        [:T_ELEM_END, nil, 1]
      ]
    end
+
+    it 'lexes a CDATA tag containing a newline after the open tag' do
+      lex("<![CDATA[\nfoo]]>").should == [
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, "\nfoo", 1],
+        [:T_CDATA_END, nil, 2]
+      ]
+    end
+
+    it 'lexes a CDATA tag containing a newline before the closing tag' do
+      lex("<![CDATA[foo\n]]>").should == [
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, "foo\n", 1],
+        [:T_CDATA_END, nil, 2]
+      ]
+    end
+
+    it 'lexes a CDATA tag with the body surrounded by newlines' do
+      lex("<![CDATA[\nfoo\n]]>").should == [
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, "\nfoo\n", 1],
+        [:T_CDATA_END, nil, 3]
+      ]
+    end
+
+    describe 'using an IO as input' do
+      it 'lexes a CDATA tag containing a newline after the open tag' do
+        lex_stringio("<![CDATA[\nfoo]]>").should == [
+          [:T_CDATA_START, nil, 1],
+          [:T_CDATA_BODY, "\n", 1],
+          [:T_CDATA_BODY, "foo", 2],
+          [:T_CDATA_END, nil, 2]
+        ]
+      end
+
+      it 'lexes a CDATA tag containing a newline before the closing tag' do
+        lex_stringio("<![CDATA[foo\n]]>").should == [
+          [:T_CDATA_START, nil, 1],
+          [:T_CDATA_BODY, "foo\n", 1],
+          [:T_CDATA_END, nil, 2]
+        ]
+      end
+
+      it 'lexes a CDATA tag with the body surrounded by newlines' do
+        lex_stringio("<![CDATA[\nfoo\n]]>").should == [
+          [:T_CDATA_START, nil, 1],
+          [:T_CDATA_BODY, "\n", 1],
+          [:T_CDATA_BODY, "foo\n", 2],
+          [:T_CDATA_END, nil, 3]
+        ]
+      end
+    end
  end
 end
--- a/spec/oga/xml/lexer/inline_javascript_spec.rb
+++ b/spec/oga/xml/lexer/inline_javascript_spec.rb
@ -30,7 +30,9 @@ describe Oga::XML::Lexer do
        [:T_ELEM_START, nil, 1],
        [:T_ELEM_NAME, 'script', 1],
        [:T_TEXT, @javascript, 1],
-        [:T_CDATA, 'foo', 1],
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, 'foo', 1],
+        [:T_CDATA_END, nil, 1],
        [:T_ELEM_END, nil, 1]
      ]
    end
--- a/spec/support/parsing_helpers.rb
+++ b/spec/support/parsing_helpers.rb
@ -29,6 +29,13 @@ module Oga
      return Oga::XML::Lexer.new(input, options).lex
    end

+    ##
+    # @see [#lex]
+    #
+    def lex_stringio(input, options = {})
+      return lex(StringIO.new(input), options)
+    end
+
    ##
    # Lexes an XPath expression.
    #