diff --git a/lib/oga/lexer.rl b/lib/oga/lexer.rl index 1329412..cb2351a 100644 --- a/lib/oga/lexer.rl +++ b/lib/oga/lexer.rl @@ -56,7 +56,16 @@ module Oga end def t(type, start = @ts, stop = @te) - value = @data[start...stop] + value = text(start, stop) + + add_token(type, value) + end + + def text(start = @ts, stop = @te) + return @data[start...stop] + end + + def add_token(type, value) token = [type, value, @line, @column] advance_column(value.length) @@ -109,43 +118,47 @@ module Oga 'HTML'i whitespace* any* greater; # CDATA + # + # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections + # + # CDATA tags are broken up into 3 parts: the start, the content and the + # end tag. + # + # In HTML CDATA tags have no meaning/are not supported. Oga does + # support them but treats their contents as plain text. + # cdata_start = smaller bang lbracket 'CDATA' lbracket; cdata_end = rbracket rbracket greater; + cdata := |* + cdata_start => { + t(:T_CDATA_START) + + @cdata_buffer = '' + }; + + cdata_end => { + add_token(:T_TEXT, @cdata_buffer) + @cdata_buffer = nil + + t(:T_CDATA_END) + + fgoto main; + }; + + # Consume everything else character by character and store it in a + # separate buffer. + any => { @cdata_buffer << text }; + *|; + main := |* whitespace => { t(:T_SPACE) }; newline => { t(:T_NEWLINE); advance_line }; doctype => { t(:T_DOCTYPE) }; - # CDATA - # - # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections - # - # CDATA tags are broken up into 3 parts: the start, the content and the - # end tag. - # - # In HTML CDATA tags have no meaning/are not supported. Oga does - # support them but treats their contents as plain text. - # - cdata_start - %{ - @cdata_start = p - t(:T_CDATA_START, @ts, p) - } - - # Consume everything except ], which is the start of the ending tag. 
- (any - rbracket)+ - %{ - t(:T_TEXT, @cdata_start, p) - - @cdata_start = nil - } - - cdata_end - >{ - t(:T_CDATA_END, p, pe) - }; + # Jump to the cdata machine right away without processing anything. + cdata_start >{ fhold; fgoto cdata; }; # General rules and actions. smaller => { t(:T_SMALLER) }; diff --git a/spec/oga/lexer/cdata_spec.rb b/spec/oga/lexer/cdata_spec.rb index 8a0f913..d70cef1 100644 --- a/spec/oga/lexer/cdata_spec.rb +++ b/spec/oga/lexer/cdata_spec.rb @@ -17,5 +17,13 @@ describe Oga::Lexer do [:T_CDATA_END, ']]>', 1, 20] ] end + + example 'lex double brackets inside a CDATA tag' do + lex('<![CDATA[]]]]>').should == [ + [:T_CDATA_START, '<![CDATA[', 1, 1], + [:T_TEXT, ']]', 1, 10], + [:T_CDATA_END, ']]>', 1, 12] + ] + end end end