Lexing of CDATA tags.

2014-02-28 00:03:37 +01:00 · 2014-02-28 00:03:37 +01:00 · 4883ac7384
parent c011e2faaa
commit 4883ac7384
2 changed files with 49 additions and 8 deletions
--- a/lib/oga/lexer.rl
+++ b/lib/oga/lexer.rl
@ -108,11 +108,50 @@ module Oga
      doctype = smaller whitespace* bang whitespace* 'DOCTYPE'i whitespace*
        'HTML'i whitespace* any* greater;

+      # CDATA
+      #
+      # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
+      #
+      # CDATA tags are broken up into 3 parts: the start, the content and the
+      # end tag.
+      #
+      # In HTML, CDATA tags have no meaning and are not supported. Oga does
+      # support them but treats their contents as plain text.
+      #
+      cdata_start = smaller bang lbracket 'CDATA' lbracket;
+      cdata_end   = rbracket rbracket greater;
+
      main := |*
        whitespace => { t(:T_SPACE) };
        newline    => { t(:T_NEWLINE); advance_line };

        doctype  => { t(:T_DOCTYPE) };
+
+        # CDATA
+        #
+        # When processing CDATA patterns we'll emit tokens for the start tag,
+        # the content and the end tag.
+        #
+        cdata_start
+          %{
+            @cdata_start = p
+            t(:T_CDATA_START, @ts, p)
+          }
+
+        # Consume every character other than ], the first character of the end tag.
+        (any - rbracket)+
+          %{
+            t(:T_TEXT, @cdata_start, p)
+
+            @cdata_start = nil
+          }
+
+        cdata_end
+          >{
+            t(:T_CDATA_END, p, pe)
+          };
+
+        # General rules and actions.
        smaller  => { t(:T_SMALLER) };
        greater  => { t(:T_GREATER) };
        slash    => { t(:T_SLASH) };
--- a/spec/oga/lexer/cdata_spec.rb
+++ b/spec/oga/lexer/cdata_spec.rb
@ -4,15 +4,17 @@ describe Oga::Lexer do
  context 'cdata tags' do
    example 'lex a cdata tag' do
      lex('<![CDATA[foo]]>').should == [
-        [:T_SMALLER, '<', 1, 1],
-        [:T_BANG, '!', 1, 2],
-        [:T_LBRACKET, '[', 1, 3],
-        [:T_TEXT, 'CDATA', 1, 4],
-        [:T_LBRACKET, '[', 1, 9],
+        [:T_CDATA_START, '<![CDATA[', 1, 1],
        [:T_TEXT, 'foo', 1, 10],
-        [:T_RBRACKET, ']', 1, 13],
-        [:T_RBRACKET, ']', 1, 14],
-        [:T_GREATER, '>', 1, 15],
+        [:T_CDATA_END, ']]>', 1, 13]
+      ]
+    end
+
+    example 'lex tags inside CDATA tags as regular text' do
+      lex('<![CDATA[<p>Foo</p>]]>').should == [
+        [:T_CDATA_START, '<![CDATA[', 1, 1],
+        [:T_TEXT, '<p>Foo</p>', 1, 10],
+        [:T_CDATA_END, ']]>', 1, 20]
      ]
    end
  end