Basic lexing of HTML tags.

The current implementation is a bit messy. In particular, the column numbers are not yet counted entirely correctly. There are also some problems with nested tags/text that I still have to resolve.
2014-03-03 22:08:46 +01:00 · 2014-03-03 22:08:46 +01:00 · a5a3b8db3f
parent d9ef33e1f8
commit a5a3b8db3f
4 changed files with 116 additions and 102 deletions
--- a/lib/oga/lexer.rl
+++ b/lib/oga/lexer.rl
@ -79,6 +79,8 @@ module Oga
    end

    def emit_text_buffer
+      return if @text_buffer.empty?
+
      add_token(:T_TEXT, @text_buffer)

      @text_buffer = ''
@ -98,12 +100,8 @@ module Oga
      newline    = '\n' | '\r\n';
      whitespace = [ \t];

-      action emit_space {
-        t(:T_SPACE)
-      }
-
      action emit_newline {
-        t(:T_NEWLINE)
+        t(:T_TEXT)
        advance_line
      }

@ -228,9 +226,66 @@ module Oga
        any => buffer_text;
      *|;

+      # Elements
+      #
+      # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
+      #
+      element_name  = [a-zA-Z0-9\-_]+;
+      element_start = '<' element_name;
+
+      # First emit the token, then advance the column. This way the column
+      # number points to the < and not the "p" in <p>.
+      action open_element {
+        t(:T_ELEM_OPEN, p)
+
+        advance_column
+
+        fcall element;
+      }
+
+      element_text := |*
+        ^'<' => buffer_text;
+
+        '<' => {
+          emit_text_buffer
+          fhold;
+          fret;
+        };
+      *|;
+
+      element := |*
+        whitespace => { advance_column };
+
+        element_start => open_element;
+
+        # Consume the text inside the element.
+        '>' => {
+          advance_column
+          fcall element_text;
+        };
+
+        # Attributes and their values.
+        element_name
+          %{
+            t(:T_ATTR, @ts, p)
+          }
+        '=' (dquote @string_dquote | squote @string_squote);
+
+        # Non self-closing tags.
+        '</' element_name {
+          emit_text_buffer
+          t(:T_ELEM_CLOSE, p)
+
+          # Advance by two to take the closing </ into account. This is done
+          # after emitting tokens to ensure that they point to the start of
+          # the tag.
+          advance_column(2)
+          fret;
+        };
+      *|;
+
      main := |*
-        whitespace => emit_space;
-        newline    => emit_newline;
+        newline => emit_newline;

        doctype_start => {
          t(:T_DOCTYPE_START)
@ -247,19 +302,10 @@ module Oga
          fcall comment;
        };

-        # General rules and actions.
-        '<' => { t(:T_SMALLER) };
-        '>' => { t(:T_GREATER) };
-        '/' => { t(:T_SLASH) };
-        '-' => { t(:T_DASH) };
-        ']' => { t(:T_RBRACKET) };
-        '[' => { t(:T_LBRACKET) };
-        ':' => { t(:T_COLON) };
-        '!' => { t(:T_BANG) };
-        '=' => { t(:T_EQUALS) };
+        element_start => open_element;

-        dquote => { t(:T_DQUOTE) };
-        squote => { t(:T_SQUOTE) };
+        #dquote => { t(:T_DQUOTE) };
+        #squote => { t(:T_SQUOTE) };
      *|;
    }%%
  end # Lexer
--- a/spec/oga/lexer/elements_spec.rb
+++ b/spec/oga/lexer/elements_spec.rb
@ -0,0 +1,47 @@
+require 'spec_helper'
+
+describe Oga::Lexer do
+  context 'elements' do
+    example 'lex an opening element' do
+      lex('<p>').should == [
+        [:T_ELEM_OPEN, 'p', 1, 1]
+      ]
+    end
+
+    example 'lex an opening an closing element' do
+      lex('<p></p>').should == [
+        [:T_ELEM_OPEN, 'p', 1, 1],
+        [:T_ELEM_CLOSE, 'p', 1, 4]
+      ]
+    end
+
+    example 'lex a paragraph element with text inside it' do
+      lex('<p>Hello</p>').should == [
+        [:T_ELEM_OPEN, 'p', 1, 1],
+        [:T_TEXT, 'Hello', 1, 4],
+        [:T_ELEM_CLOSE, 'p', 1, 9]
+      ]
+    end
+
+    example 'lex a paragraph element with attributes' do
+      lex('<p class="foo">Hello</p>').should == [
+        [:T_ELEM_OPEN, 'p', 1, 1],
+        [:T_ATTR, 'class', 1, 4],
+        [:T_STRING, 'foo', 1, 10],
+        [:T_TEXT, 'Hello', 1, 15],
+        [:T_ELEM_CLOSE, 'p', 1, 20]
+      ]
+    end
+  end
+
+  context 'nested elements' do
+    example 'lex a nested element' do
+      lex('<p><a></a></p>').should == [
+        [:T_ELEM_OPEN, 'p', 1, 1],
+        [:T_ELEM_OPEN, 'a', 1, 4],
+        [:T_ELEM_CLOSE, 'a', 1, 7],
+        [:T_ELEM_CLOSE, 'p', 1, 11]
+      ]
+    end
+  end
+end
--- a/spec/oga/lexer/general_spec.rb
+++ b/spec/oga/lexer/general_spec.rb
@ -9,24 +9,17 @@ describe Oga::Lexer do

  context 'whitespace' do
    example 'lex regular whitespace' do
-      lex(' ').should == [[:T_SPACE, ' ', 1, 1]]
+      lex(' ').should == [[:T_TEXT, ' ', 1, 1]]
    end

    example 'lex a newline' do
-      lex("\n").should == [[:T_NEWLINE, "\n", 1, 1]]
-    end
-
-    example 'advance column numbers for spaces' do
-      lex('  ').should == [
-        [:T_SPACE, ' ', 1, 1],
-        [:T_SPACE, ' ', 1, 2]
-      ]
+      lex("\n").should == [[:T_TEXT, "\n", 1, 1]]
    end

    example 'advance line numbers for newlines' do
      lex("\n ").should == [
-        [:T_NEWLINE, "\n", 1, 1],
-        [:T_SPACE, ' ', 2, 1]
+        [:T_TEXT, "\n", 1, 1],
+        [:T_TEXT, ' ', 2, 1]
      ]
    end
  end
--- a/spec/oga/lexer/tags_spec.rb
+++ b/spec/oga/lexer/tags_spec.rb
@ -1,72 +0,0 @@
-require 'spec_helper'
-
-describe Oga::Lexer do
-  context 'tags' do
-    example 'lex an opening tag' do
-      lex('<p>').should == [
-        [:T_SMALLER, '<', 1, 1],
-        [:T_TEXT, 'p', 1, 2],
-        [:T_GREATER, '>', 1, 3]
-      ]
-    end
-
-    example 'lex an opening tag with an attribute' do
-      lex('<p title="Foo">').should == [
-        [:T_SMALLER, '<', 1, 1],
-        [:T_TEXT, 'p', 1, 2],
-        [:T_SPACE, ' ', 1, 3],
-        [:T_TEXT, 'title', 1, 4],
-        [:T_EQUALS, '=', 1, 9],
-        [:T_DQUOTE, '"', 1, 10],
-        [:T_TEXT, 'Foo', 1, 11],
-        [:T_DQUOTE, '"', 1, 14],
-        [:T_GREATER, '>', 1, 15]
-      ]
-    end
-
-    example 'lex a tag with text inside it' do
-      lex('<p>Foo</p>').should == [
-        [:T_SMALLER, '<', 1, 1],
-        [:T_TEXT, 'p', 1, 2],
-        [:T_GREATER, '>', 1, 3],
-        [:T_TEXT, 'Foo', 1, 4],
-        [:T_SMALLER, '<', 1, 7],
-        [:T_SLASH, '/', 1, 8],
-        [:T_TEXT, 'p', 1, 9],
-        [:T_GREATER, '>', 1, 10]
-      ]
-    end
-
-    example 'lex a tag with an attribute with a dash in it' do
-      lex('<p foo-bar="baz">').should == [
-        [:T_SMALLER, '<', 1, 1],
-        [:T_TEXT, 'p', 1, 2],
-        [:T_SPACE, ' ', 1, 3],
-        [:T_TEXT, 'foo', 1, 4],
-        [:T_DASH, '-', 1, 7],
-        [:T_TEXT, 'bar', 1, 8],
-        [:T_EQUALS, '=', 1, 11],
-        [:T_DQUOTE, '"', 1, 12],
-        [:T_TEXT, 'baz', 1, 13],
-        [:T_DQUOTE, '"', 1, 16],
-        [:T_GREATER, '>', 1, 17]
-      ]
-    end
-  end
-
-  context 'tags with namespaces' do
-    example 'lex a tag with a dummy namespace' do
-      lex('<foo:p></p>').should == [
-        [:T_SMALLER, '<', 1, 1],
-        [:T_TEXT, 'foo', 1, 2],
-        [:T_COLON, ':', 1, 5],
-        [:T_TEXT, 'p', 1, 6],
-        [:T_GREATER, '>', 1, 7],
-        [:T_SMALLER, '<', 1, 8],
-        [:T_SLASH, '/', 1, 9],
-        [:T_TEXT, 'p', 1, 10],
-        [:T_GREATER, '>', 1, 11]
-      ]
-    end
-  end
-end