From c4e0406ed9747fc3d2cfd56eee93cdf92059ef20 Mon Sep 17 00:00:00 2001
From: Yorick Peterse <yorickpeterse@gmail.com>
Date: Wed, 26 Feb 2014 22:01:07 +0100
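Subject: [PATCH] Lexing of CDATA tags.

Square brackets are now lexed as their own T_LBRACKET and T_RBRACKET
tokens and are excluded from the "text" rule. As a result the input
"<![CDATA[foo]]>" is emitted as a sequence of individual tokens
(T_SMALLER, T_BANG, T_LBRACKET, T_TEXT, ...) rather than as a single
CDATA token.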

---
 lib/oga/lexer.rl       | 21 +++++++++++++--------
 spec/oga/lexer_spec.rb | 16 ++++++++++++++++
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/lib/oga/lexer.rl b/lib/oga/lexer.rl
index a24bbcd..0518f9e 100644
--- a/lib/oga/lexer.rl
+++ b/lib/oga/lexer.rl
@@ -73,20 +73,23 @@ module Oga
 
       any_escaped = /\\./;
 
-      smaller = '<';
-      greater = '>';
-      slash   = '/';
-      bang    = '!';
-      equals  = '=';
-      colon   = ':';
-      dash    = '-';
+      smaller  = '<';
+      greater  = '>';
+      slash    = '/';
+      bang     = '!';
+      equals   = '=';
+      colon    = ':';
+      dash     = '-';
+      lbracket = '[';
+      rbracket = ']';
 
       s_quote  = "'";
       d_quote  = '"';
 
       # FIXME: there really should be a better way of doing this.
       text = (any - s_quote - d_quote - equals - bang - slash -
-        greater - smaller - whitespace - newline - colon - dash)+;
+        greater - smaller - whitespace - newline - colon - dash -
+        lbracket - rbracket)+;
 
       # Unicode characters, taken from whitequark's wonderful parser library.
       # (I honestly need to buy that dude a beer or 100). Basically this
@@ -103,6 +106,8 @@ module Oga
         d_quote    => { t(:T_DQUOTE) };
         s_quote    => { t(:T_SQUOTE) };
         dash       => { t(:T_DASH) };
+        rbracket   => { t(:T_RBRACKET) };
+        lbracket   => { t(:T_LBRACKET) };
         colon      => { t(:T_COLON) };
         bang       => { t(:T_BANG) };
         equals     => { t(:T_EQUALS) };
diff --git a/spec/oga/lexer_spec.rb b/spec/oga/lexer_spec.rb
index 40e0548..ca99113 100644
--- a/spec/oga/lexer_spec.rb
+++ b/spec/oga/lexer_spec.rb
@@ -116,4 +116,20 @@ describe Oga::Lexer do
       ]
     end
   end
+
+  context 'cdata tags' do
+    example 'lex a cdata tag' do
+      lex('<![CDATA[foo]]>').should == [
+        [:T_SMALLER, '<', 1, 1],
+        [:T_BANG, '!', 1, 2],
+        [:T_LBRACKET, '[', 1, 3],
+        [:T_TEXT, 'CDATA', 1, 4],
+        [:T_LBRACKET, '[', 1, 9],
+        [:T_TEXT, 'foo', 1, 10],
+        [:T_RBRACKET, ']', 1, 13],
+        [:T_RBRACKET, ']', 1, 14],
+        [:T_GREATER, '>', 1, 15],
+      ]
+    end
+  end
 end