From 8acc7fc743c9492eed2d9c885c22c1b5bec06d0f Mon Sep 17 00:00:00 2001
From: Yorick Peterse <yorickpeterse@gmail.com>
Date: Tue, 14 Apr 2015 22:45:55 +0200
Subject: [PATCH] Lex CDATA tags in chunks

Instead of using a single token (T_CDATA) for a CDATA tag the lexer now
uses 3 tokens:

1. T_CDATA_START
2. T_CDATA_BODY
3. T_CDATA_END

The T_CDATA_BODY token can occur multiple times and is turned into a
single value in the XML parser. This is similar to the way strings are
lexed.

By changing the way CDATA tags are lexed, Oga can now lex CDATA tags
containing newlines when using an IO as input. For example, this would
previously fail:

    Oga.parse_xml(StringIO.new("<![CDATA[\nfoo]]>"))

Because IO input is read line by line, the lexer would receive the
following chunks:

    "<![CDATA[\n"
    "foo]]>"

Related issues: #93
---
 ext/c/lexer.rl                               |  4 +-
 ext/java/org/liboga/xml/Lexer.rl             |  4 +-
 ext/ragel/base_lexer.rl                      | 29 ++++++-
 lib/oga/xml/lexer.rb                         | 22 ++++-
 lib/oga/xml/parser.rll                       | 10 ++-
 spec/oga/xml/lexer/cdata_spec.rb             | 91 ++++++++++++++++++--
 spec/oga/xml/lexer/inline_javascript_spec.rb |  4 +-
 spec/support/parsing_helpers.rb              |  7 ++
 8 files changed, 153 insertions(+), 18 deletions(-)

diff --git a/ext/c/lexer.rl b/ext/c/lexer.rl
index 2fdf16b..db0037d 100644
--- a/ext/c/lexer.rl
+++ b/ext/c/lexer.rl
@@ -90,7 +90,9 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
     ID id_advance_line        = rb_intern("advance_line");
     ID id_on_attribute        = rb_intern("on_attribute");
     ID id_on_attribute_ns     = rb_intern("on_attribute_ns");
-    ID id_on_cdata            = rb_intern("on_cdata");
+    ID id_on_cdata_start      = rb_intern("on_cdata_start");
+    ID id_on_cdata_body       = rb_intern("on_cdata_body");
+    ID id_on_cdata_end        = rb_intern("on_cdata_end");
     ID id_on_comment          = rb_intern("on_comment");
     ID id_on_doctype_end      = rb_intern("on_doctype_end");
     ID id_on_doctype_inline   = rb_intern("on_doctype_inline");
diff --git a/ext/java/org/liboga/xml/Lexer.rl b/ext/java/org/liboga/xml/Lexer.rl
index ae3a6eb..738ce85 100644
--- a/ext/java/org/liboga/xml/Lexer.rl
+++ b/ext/java/org/liboga/xml/Lexer.rl
@@ -104,7 +104,9 @@ public class Lexer extends RubyObject
         String id_advance_line        = "advance_line";
         String id_on_attribute        = "on_attribute";
         String id_on_attribute_ns     = "on_attribute_ns";
-        String id_on_cdata            = "on_cdata";
+        String id_on_cdata_start      = "on_cdata_start";
+        String id_on_cdata_body       = "on_cdata_body";
+        String id_on_cdata_end        = "on_cdata_end";
         String id_on_comment          = "on_comment";
         String id_on_doctype_end      = "on_doctype_end";
         String id_on_doctype_inline   = "on_doctype_inline";
diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl
index a04140f..a0721e7 100644
--- a/ext/ragel/base_lexer.rl
+++ b/ext/ragel/base_lexer.rl
@@ -83,12 +83,35 @@
 
     cdata_start = '<![CDATA[';
     cdata_end   = ']]>';
-    cdata       = cdata_start (any* -- cdata_end) cdata_end;
+
+    # Everything except "]" OR a single "]"
+    cdata_allowed = (^']'+ | ']') $count_newlines;
 
     action start_cdata {
-        callback(id_on_cdata, data, encoding, ts + 9, te - 3);
+        callback_simple(id_on_cdata_start);
+
+        fnext cdata_body;
     }
 
+    cdata_body := |*
+        cdata_allowed => {
+            callback(id_on_cdata_body, data, encoding, ts, te);
+
+            if ( lines > 0 )
+            {
+                advance_line(lines);
+
+                lines = 0;
+            }
+        };
+
+        cdata_end => {
+            callback_simple(id_on_cdata_end);
+
+            fnext main;
+        };
+    *|;
+
     # Processing Instructions
     #
     # http://www.w3.org/TR/xpath/#section-Processing-Instruction-Nodes
@@ -439,7 +462,7 @@
         doctype_start  => start_doctype;
         xml_decl_start => start_xml_decl;
         comment        => start_comment;
-        cdata          => start_cdata;
+        cdata_start    => start_cdata;
         proc_ins_start => start_proc_ins;
         element_start  => start_element;
         element_end    => close_element;
diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb
index c84e90f..d5f51d2 100644
--- a/lib/oga/xml/lexer.rb
+++ b/lib/oga/xml/lexer.rb
@@ -262,10 +262,26 @@ module Oga
       end
 
       ##
-      # Called on a CDATA tag.
+      # Called on the open CDATA tag.
       #
-      def on_cdata(value)
-        add_token(:T_CDATA, value)
+      def on_cdata_start
+        add_token(:T_CDATA_START)
+      end
+
+      ##
+      # Called on the closing CDATA tag.
+      #
+      def on_cdata_end
+        add_token(:T_CDATA_END)
+      end
+
+      ##
+      # Called for the body of a CDATA tag.
+      #
+      # @param [String] value
+      #
+      def on_cdata_body(value)
+        add_token(:T_CDATA_BODY, value)
       end
 
       ##
diff --git a/lib/oga/xml/parser.rll b/lib/oga/xml/parser.rll
index a676a49..fd33903 100644
--- a/lib/oga/xml/parser.rll
+++ b/lib/oga/xml/parser.rll
@@ -24,7 +24,8 @@
 
 %terminals T_TEXT T_STRING_SQUOTE T_STRING_DQUOTE T_STRING_BODY;
 %terminals T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME;
-%terminals T_DOCTYPE_INLINE T_CDATA T_COMMENT;
+%terminals T_DOCTYPE_INLINE T_COMMENT;
+%terminals T_CDATA_START T_CDATA_BODY T_CDATA_END;
 %terminals T_ELEM_START T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR T_ATTR_NS;
 %terminals T_XML_DECL_START T_XML_DECL_END;
 %terminals T_PROC_INS_START T_PROC_INS_NAME T_PROC_INS_END;
@@ -93,7 +94,12 @@ doctype_types
 # CDATA tags
 
 cdata
-  = T_CDATA { on_cdata(val[0]) }
+  = T_CDATA_START cdata_body T_CDATA_END { on_cdata(val[1]) }
+  ;
+
+cdata_body
+  = T_CDATA_BODY cdata_body { val[0] + val[1] }
+  | _                       { '' }
   ;
 
 # Comments
diff --git a/spec/oga/xml/lexer/cdata_spec.rb b/spec/oga/xml/lexer/cdata_spec.rb
index 82c304e..48f873d 100644
--- a/spec/oga/xml/lexer/cdata_spec.rb
+++ b/spec/oga/xml/lexer/cdata_spec.rb
@@ -1,30 +1,107 @@
 require 'spec_helper'
 
 describe Oga::XML::Lexer do
-  describe 'cdata tags' do
-    it 'lexes a cdata tag' do
-      lex('<![CDATA[foo]]>').should == [[:T_CDATA, 'foo', 1]]
+  describe 'CDATA tags' do
+    it 'lexes a CDATA tag' do
+      lex('<![CDATA[foo]]>').should == [
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, 'foo', 1],
+        [:T_CDATA_END, nil, 1]
+      ]
     end
 
     it 'lexes tags inside CDATA tags as regular text' do
-      lex('<![CDATA[<p>Foo</p>]]>').should == [[:T_CDATA, '<p>Foo</p>', 1]]
+      lex('<![CDATA[<p>Foo</p>]]>').should == [
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, '<p>Foo</p>', 1],
+        [:T_CDATA_END, nil, 1]
+      ]
+    end
+
+    it 'lexes a single bracket inside a CDATA tag' do
+      lex('<![CDATA[]]]>').should == [
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, ']', 1],
+        [:T_CDATA_END, nil, 1]
+      ]
     end
 
     it 'lexes double brackets inside a CDATA tag' do
-      lex('<![CDATA[]]]]>').should == [[:T_CDATA, ']]', 1]]
+      lex('<![CDATA[]]]]>').should == [
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, ']', 1],
+        [:T_CDATA_BODY, ']', 1],
+        [:T_CDATA_END, nil, 1]
+      ]
     end
 
     it 'lexes two CDATA tags following each other' do
       lex('<a><![CDATA[foo]]><b><![CDATA[bar]]></b></a>').should == [
         [:T_ELEM_START, nil, 1],
         [:T_ELEM_NAME, 'a', 1],
-        [:T_CDATA, 'foo', 1],
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, 'foo', 1],
+        [:T_CDATA_END, nil, 1],
         [:T_ELEM_START, nil, 1],
         [:T_ELEM_NAME, 'b', 1],
-        [:T_CDATA, 'bar', 1],
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, 'bar', 1],
+        [:T_CDATA_END, nil, 1],
         [:T_ELEM_END, nil, 1],
         [:T_ELEM_END, nil, 1]
       ]
     end
+
+    it 'lexes a CDATA tag containing a newline after the open tag' do
+      lex("<![CDATA[\nfoo]]>").should == [
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, "\nfoo", 1],
+        [:T_CDATA_END, nil, 2]
+      ]
+    end
+
+    it 'lexes a CDATA tag containing a newline before the closing tag' do
+      lex("<![CDATA[foo\n]]>").should == [
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, "foo\n", 1],
+        [:T_CDATA_END, nil, 2]
+      ]
+    end
+
+    it 'lexes a CDATA tag with the body surrounded by newlines' do
+      lex("<![CDATA[\nfoo\n]]>").should == [
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, "\nfoo\n", 1],
+        [:T_CDATA_END, nil, 3]
+      ]
+    end
+
+    describe 'using an IO as input' do
+      it 'lexes a CDATA tag containing a newline after the open tag' do
+        lex_stringio("<![CDATA[\nfoo]]>").should == [
+          [:T_CDATA_START, nil, 1],
+          [:T_CDATA_BODY, "\n", 1],
+          [:T_CDATA_BODY, "foo", 2],
+          [:T_CDATA_END, nil, 2]
+        ]
+      end
+
+      it 'lexes a CDATA tag containing a newline before the closing tag' do
+        lex_stringio("<![CDATA[foo\n]]>").should == [
+          [:T_CDATA_START, nil, 1],
+          [:T_CDATA_BODY, "foo\n", 1],
+          [:T_CDATA_END, nil, 2]
+        ]
+      end
+
+      it 'lexes a CDATA tag with the body surrounded by newlines' do
+        lex_stringio("<![CDATA[\nfoo\n]]>").should == [
+          [:T_CDATA_START, nil, 1],
+          [:T_CDATA_BODY, "\n", 1],
+          [:T_CDATA_BODY, "foo\n", 2],
+          [:T_CDATA_END, nil, 3]
+        ]
+      end
+    end
   end
 end
diff --git a/spec/oga/xml/lexer/inline_javascript_spec.rb b/spec/oga/xml/lexer/inline_javascript_spec.rb
index 9ffc944..e1d7295 100644
--- a/spec/oga/xml/lexer/inline_javascript_spec.rb
+++ b/spec/oga/xml/lexer/inline_javascript_spec.rb
@@ -30,7 +30,9 @@ describe Oga::XML::Lexer do
         [:T_ELEM_START, nil, 1],
         [:T_ELEM_NAME, 'script', 1],
         [:T_TEXT, @javascript, 1],
-        [:T_CDATA, 'foo', 1],
+        [:T_CDATA_START, nil, 1],
+        [:T_CDATA_BODY, 'foo', 1],
+        [:T_CDATA_END, nil, 1],
         [:T_ELEM_END, nil, 1]
       ]
     end
diff --git a/spec/support/parsing_helpers.rb b/spec/support/parsing_helpers.rb
index d5f07d6..a570dee 100644
--- a/spec/support/parsing_helpers.rb
+++ b/spec/support/parsing_helpers.rb
@@ -29,6 +29,13 @@ module Oga
       return Oga::XML::Lexer.new(input, options).lex
     end
 
+    ##
+    # @see [#lex]
+    #
+    def lex_stringio(input, options = {})
+      return lex(StringIO.new(input), options)
+    end
+
     ##
     # Lexes an XPath expression.
     #