Lexing of CDATA tags.

2014-02-26 22:01:07 +01:00 · 2014-02-26 22:01:07 +01:00 · c4e0406ed9
parent 0a336e76d3
commit c4e0406ed9
2 changed files with 29 additions and 8 deletions
--- a/lib/oga/lexer.rl
+++ b/lib/oga/lexer.rl
@@ -73,20 +73,23 @@ module Oga

      any_escaped = /\\./;

-      smaller = '<';
-      greater = '>';
-      slash   = '/';
-      bang    = '!';
-      equals  = '=';
-      colon   = ':';
-      dash    = '-';
+      smaller  = '<';
+      greater  = '>';
+      slash    = '/';
+      bang     = '!';
+      equals   = '=';
+      colon    = ':';
+      dash     = '-';
+      lbracket = '[';
+      rbracket = ']';

      s_quote  = "'";
      d_quote  = '"';

      # FIXME: there really should be a better way of doing this.
      text = (any - s_quote - d_quote - equals - bang - slash -
-        greater - smaller - whitespace - newline - colon - dash)+;
+        greater - smaller - whitespace - newline - colon - dash -
+        lbracket - rbracket)+;

      # Unicode characters, taken from whitequark's wonderful parser library.
      # (I honestly need to buy that dude a beer or 100). Basically this
@@ -103,6 +106,8 @@ module Oga
        d_quote    => { t(:T_DQUOTE) };
        s_quote    => { t(:T_SQUOTE) };
        dash       => { t(:T_DASH) };
+        rbracket   => { t(:T_RBRACKET) };
+        lbracket   => { t(:T_LBRACKET) };
        colon      => { t(:T_COLON) };
        bang       => { t(:T_BANG) };
        equals     => { t(:T_EQUALS) };
--- a/spec/oga/lexer_spec.rb
+++ b/spec/oga/lexer_spec.rb
@@ -116,4 +116,20 @@ describe Oga::Lexer do
      ]
    end
  end
+
+  context 'cdata tags' do
+    example 'lex a cdata tag' do
+      lex('<![CDATA[foo]]>').should == [
+        [:T_SMALLER, '<', 1, 1],
+        [:T_BANG, '!', 1, 2],
+        [:T_LBRACKET, '[', 1, 3],
+        [:T_TEXT, 'CDATA', 1, 4],
+        [:T_LBRACKET, '[', 1, 9],
+        [:T_TEXT, 'foo', 1, 10],
+        [:T_RBRACKET, ']', 1, 13],
+        [:T_RBRACKET, ']', 1, 14],
+        [:T_GREATER, '>', 1, 15],
+      ]
+    end
+  end
 end