Lexing of comments.

This fixes #4.
2014-02-28 23:27:23 +01:00 · 2014-02-28 23:27:23 +01:00 · d9ef33e1f8
parent 92ae48f905
commit d9ef33e1f8
2 changed files with 48 additions and 17 deletions
--- a/lib/oga/lexer.rl
+++ b/lib/oga/lexer.rl
@ -197,14 +197,34 @@ module Oga
      cdata := |*
        cdata_end => {
          emit_text_buffer
          t(:T_CDATA_END)
          fret;
        };
-        # Consume everything else character by character and store it in a
+        any => buffer_text;
-        # separate buffer.
+      *|;
      # Comments
      #
      # http://www.w3.org/TR/html-markup/syntax.html#comments
      #
      # Comments are lexed into 3 parts: the start tag, the content and the end
      # tag.
      #
      # Unlike the W3 specification these rules *do* allow character sequences
      # such as `--` and `->`. Putting extra checks in for these sequences
      # would actually make the rules/actions more complex.
      #
      # Literal delimiters that open and close a comment.
      comment_start = '<!--';
      comment_end   = '-->';
      # Scanner that is active while the lexer is inside a comment. It is
      # entered via `fcall comment` when `comment_start` is matched.
      comment := |*
        # Closing delimiter: flush the buffered content (presumably emitted as
        # a T_TEXT token -- confirm against emit_text_buffer), emit the end
        # token and return to the scanner that called into this one.
        comment_end => {
          emit_text_buffer
          t(:T_COMMENT_END)
          fret;
        };
        # Every other character is treated as comment content and handed to
        # the buffer_text action. This is why `--` and `->` are accepted inside
        # a comment: only the full `-->` sequence matches the rule above.
        any => buffer_text;
      *|;
@ -214,17 +234,19 @@ module Oga
        doctype_start => {
          t(:T_DOCTYPE_START)
          fcall doctype;
        };
        # Emit the start token, then let the cdata machine lex the content.
        cdata_start => {
          t(:T_CDATA_START)
          fcall cdata;
        };
        comment_start => {
          t(:T_COMMENT_START)
          fcall comment;
        };
        # General rules and actions.
        '<' => { t(:T_SMALLER) };
        '>' => { t(:T_GREATER) };
--- a/spec/oga/lexer/comments_spec.rb
+++ b/spec/oga/lexer/comments_spec.rb
@ -4,16 +4,25 @@ describe Oga::Lexer do
  context 'comments' do
    example 'lex a comment' do
      lex('<!-- foo -->').should == [
-        [:T_SMALLER, '<', 1, 1],
+        [:T_COMMENT_START, '<!--', 1, 1],
-        [:T_BANG, '!', 1, 2],
+        [:T_TEXT, ' foo ', 1, 5],
-        [:T_DASH, '-', 1, 3],
+        [:T_COMMENT_END, '-->', 1, 10]
-        [:T_DASH, '-', 1, 4],
+      ]
-        [:T_SPACE, ' ', 1, 5],
+    end
-        [:T_TEXT, 'foo', 1, 6],
+
-        [:T_SPACE, ' ', 1, 9],
+    example 'lex a comment containing --' do
-        [:T_DASH, '-', 1, 10],
+      lex('<!-- -- -->').should == [
-        [:T_DASH, '-', 1, 11],
+        [:T_COMMENT_START, '<!--', 1, 1],
-        [:T_GREATER, '>', 1, 12]
+        [:T_TEXT, ' -- ', 1, 5],
        [:T_COMMENT_END, '-->', 1, 9]
      ]
    end
    # A lone `->` inside the comment body is plain text; only the full `-->`
    # sequence terminates the comment.
    example 'lex a comment containing ->' do
      expected_tokens = [
        [:T_COMMENT_START, '<!--', 1, 1],
        [:T_TEXT, ' -> ', 1, 5],
        [:T_COMMENT_END, '-->', 1, 9]
      ]

      lex('<!-- -> -->').should == expected_tokens
    end
  end