Better lexing of CDATA tags.

The lexer can now lex CDATA tags whose content contains bracket sequences such as `]]` (for example `<![CDATA[]]]]>`).
2014-02-28 20:05:12 +01:00 · 2014-02-28 20:05:12 +01:00 · 2294bf19f4
parent 6138945d53
commit 2294bf19f4
2 changed files with 50 additions and 29 deletions
--- a/lib/oga/lexer.rl
+++ b/lib/oga/lexer.rl
@ -56,7 +56,16 @@ module Oga
    end

    def t(type, start = @ts, stop = @te)
-      value = @data[start...stop]
+      value = text(start, stop)
+
+      add_token(type, value)
+    end
+
+    def text(start = @ts, stop = @te)
+      return @data[start...stop]
+    end
+
+    def add_token(type, value)
      token = [type, value, @line, @column]

      advance_column(value.length)
@ -109,43 +118,47 @@ module Oga
        'HTML'i whitespace* any* greater;

      # CDATA
+      #
+      # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
+      #
+      # CDATA tags are broken up into 3 parts: the start, the content and the
+      # end tag.
+      #
+      # In HTML CDATA tags have no meaning/are not supported. Oga does
+      # support them but treats their contents as plain text.
+      #
      cdata_start = smaller bang lbracket 'CDATA' lbracket;
      cdata_end   = rbracket rbracket greater;

+      cdata := |*
+        cdata_start => {
+          t(:T_CDATA_START)
+
+          @cdata_buffer = ''
+        };
+
+        cdata_end => {
+          add_token(:T_TEXT, @cdata_buffer)
+          @cdata_buffer = nil
+
+          t(:T_CDATA_END)
+
+          fgoto main;
+        };
+
+        # Consume everything else character by character and store it in a
+        # separate buffer.
+        any => { @cdata_buffer << text };
+      *|;
+
      main := |*
        whitespace => { t(:T_SPACE) };
        newline    => { t(:T_NEWLINE); advance_line };

        doctype  => { t(:T_DOCTYPE) };

-        # CDATA
-        #
-        # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
-        #
-        # CDATA tags are broken up into 3 parts: the start, the content and the
-        # end tag.
-        #
-        # In HTML CDATA tags have no meaning/are not supported. Oga does
-        # support them but treats their contents as plain text.
-        #
-        cdata_start
-          %{
-            @cdata_start = p
-            t(:T_CDATA_START, @ts, p)
-          }
-
-        # Consume everything except ], which is the start of the ending tag.
-        (any - rbracket)+
-          %{
-            t(:T_TEXT, @cdata_start, p)
-
-            @cdata_start = nil
-          }
-
-        cdata_end
-          >{
-            t(:T_CDATA_END, p, pe)
-          };
+        # Jump to the cdata machine right away without processing anything.
+        cdata_start >{ fhold; fgoto cdata; };

        # General rules and actions.
        smaller  => { t(:T_SMALLER) };
--- a/spec/oga/lexer/cdata_spec.rb
+++ b/spec/oga/lexer/cdata_spec.rb
@ -17,5 +17,13 @@ describe Oga::Lexer do
        [:T_CDATA_END, ']]>', 1, 20]
      ]
    end
+
+    example 'lex double brackets inside a CDATA tag' do
+      lex('<![CDATA[]]]]>').should == [
+        [:T_CDATA_START, '<![CDATA[', 1, 1],
+        [:T_TEXT, ']]', 1, 10],
+        [:T_CDATA_END, ']]>', 1, 12]
+      ]
+    end
  end
 end