Better lexing of CDATA tags.

The lexer can now handle CDATA sections whose contents include bracket sequences such as `]]`, which previously would have been mistaken for the start of the closing `]]>` tag.
2014-02-28 20:05:12 +01:00 · 2014-02-28 20:05:12 +01:00 · 2294bf19f4
parent 6138945d53
commit 2294bf19f4
2 changed files with 50 additions and 29 deletions
--- a/lib/oga/lexer.rl
+++ b/lib/oga/lexer.rl
@@ -56,7 +56,16 @@ module Oga
    end
    def t(type, start = @ts, stop = @te)
-      value = @data[start...stop]
+      value = text(start, stop)
      add_token(type, value)
    end
    def text(start = @ts, stop = @te)
      return @data[start...stop]
    end
    def add_token(type, value)
      token = [type, value, @line, @column]
      advance_column(value.length)
@@ -108,16 +117,6 @@ module Oga
      doctype = smaller whitespace* bang whitespace* 'DOCTYPE'i whitespace*
        'HTML'i whitespace* any* greater;
      # CDATA
      cdata_start = smaller bang lbracket 'CDATA' lbracket;
      cdata_end   = rbracket rbracket greater;
      main := |*
        whitespace => { t(:T_SPACE) };
        newline    => { t(:T_NEWLINE); advance_line };
        doctype  => { t(:T_DOCTYPE) };
      # CDATA
      #
      # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
@@ -128,25 +127,39 @@ module Oga
      # In HTML CDATA tags have no meaning/are not supported. Oga does
      # support them but treats their contents as plain text.
      #
-        cdata_start
+      cdata_start = smaller bang lbracket 'CDATA' lbracket;
-          %{
+      cdata_end   = rbracket rbracket greater;
            @cdata_start = p
            t(:T_CDATA_START, @ts, p)
          }
-        # Consume everything except ], which is the start of the ending tag.
+      cdata := |*
-        (any - rbracket)+
+        cdata_start => {
-          %{
+          t(:T_CDATA_START)
            t(:T_TEXT, @cdata_start, p)
-            @cdata_start = nil
+          @cdata_buffer = ''
          }
        cdata_end
          >{
            t(:T_CDATA_END, p, pe)
        };
        cdata_end => {
          add_token(:T_TEXT, @cdata_buffer)
          @cdata_buffer = nil
          t(:T_CDATA_END)
          fgoto main;
        };
        # Consume everything else character by character and store it in a
        # separate buffer.
        any => { @cdata_buffer << text };
      *|;
      main := |*
        whitespace => { t(:T_SPACE) };
        newline    => { t(:T_NEWLINE); advance_line };
        doctype  => { t(:T_DOCTYPE) };
        # Jump to the cdata machine right away without processing anything.
        cdata_start >{ fhold; fgoto cdata; };
        # General rules and actions.
        smaller  => { t(:T_SMALLER) };
        greater  => { t(:T_GREATER) };
--- a/spec/oga/lexer/cdata_spec.rb
+++ b/spec/oga/lexer/cdata_spec.rb
@@ -17,5 +17,13 @@ describe Oga::Lexer do
        [:T_CDATA_END, ']]>', 1, 20]
      ]
    end
    example 'lex double brackets inside a CDATA tag' do
      lex('<![CDATA[]]]]>').should == [
        [:T_CDATA_START, '<![CDATA[', 1, 1],
        [:T_TEXT, ']]', 1, 10],
        [:T_CDATA_END, ']]>', 1, 12]
      ]
    end
  end
 end