diff --git a/lib/oga/lexer.rl b/lib/oga/lexer.rl index f3b1418..73a8411 100644 --- a/lib/oga/lexer.rl +++ b/lib/oga/lexer.rl @@ -197,14 +197,34 @@ module Oga cdata := |* cdata_end => { emit_text_buffer - t(:T_CDATA_END) - fret; }; - # Consume everything else character by character and store it in a - # separate buffer. + any => buffer_text; + *|; + + # Comments + # + # http://www.w3.org/TR/html-markup/syntax.html#comments + # + # Comments are lexed into 3 parts: the start tag, the content and the end + # tag. + # + # Unlike the W3 specification these rules *do* allow character sequences + # such as `--` and `->`. Putting extra checks in for these sequences + # would actually make the rules/actions more complex. + # + comment_start = '<!--'; + comment_end = '-->'; + + comment := |* + comment_end => { + emit_text_buffer + t(:T_COMMENT_END) + fret; + }; + any => buffer_text; *|; @@ -214,17 +234,19 @@ module Oga doctype_start => { t(:T_DOCTYPE_START) - fcall doctype; }; - # @cdata_buffer is used to store the content of the CDATA tag. cdata_start => { t(:T_CDATA_START) - fcall cdata; }; + comment_start => { + t(:T_COMMENT_START) + fcall comment; + }; + # General rules and actions. 
'<' => { t(:T_SMALLER) }; '>' => { t(:T_GREATER) }; diff --git a/spec/oga/lexer/comments_spec.rb b/spec/oga/lexer/comments_spec.rb index 537f8e5..cd2b2bf 100644 --- a/spec/oga/lexer/comments_spec.rb +++ b/spec/oga/lexer/comments_spec.rb @@ -4,16 +4,25 @@ describe Oga::Lexer do context 'comments' do example 'lex a comment' do lex('<!-- foo -->').should == [ - [:T_SMALLER, '<', 1, 1], - [:T_BANG, '!', 1, 2], - [:T_DASH, '-', 1, 3], - [:T_DASH, '-', 1, 4], - [:T_SPACE, ' ', 1, 5], - [:T_TEXT, 'foo', 1, 6], - [:T_SPACE, ' ', 1, 9], - [:T_DASH, '-', 1, 10], - [:T_DASH, '-', 1, 11], - [:T_GREATER, '>', 1, 12] + [:T_COMMENT_START, '<!--', 1, 1], + [:T_TEXT, ' foo ', 1, 5], + [:T_COMMENT_END, '-->', 1, 10] + ] + end + + example 'lex a comment containing --' do + lex('<!-- -- -->').should == [ + [:T_COMMENT_START, '<!--', 1, 1], + [:T_TEXT, ' -- ', 1, 5], + [:T_COMMENT_END, '-->', 1, 9] + ] + end + + example 'lex a comment containing ->' do + lex('<!-- -> -->').should == [ + [:T_COMMENT_START, '<!--', 1, 1], + [:T_TEXT, ' -> ', 1, 5], + [:T_COMMENT_END, '-->', 1, 9] ] end end