Basic lexing + parsing of doctypes.

We're doing these the lazy way. I can't be bothered writing patterns/rules for 4 different formats for something such as doctypes.
2014-02-27 01:27:43 +01:00 · 2014-02-27 01:27:43 +01:00 · 2c82f88f6c
parent d7d20b4c23
commit 2c82f88f6c
5 changed files with 94 additions and 20 deletions
--- a/lib/oga/lexer.rl
+++ b/lib/oga/lexer.rl
@ -83,35 +83,48 @@ module Oga
      lbracket = '[';
      rbracket = ']';

-      s_quote  = "'";
-      d_quote  = '"';
+      s_quote = "'";
+      d_quote = '"';

      # FIXME: there really should be a better way of doing this.
      text = (any - s_quote - d_quote - equals - bang - slash -
        greater - smaller - whitespace - newline - colon - dash -
        lbracket - rbracket)+;

-      # Unicode characters, taken from whitequark's wonderful parser library.
-      # (I honestly need to buy that dude a beer or 100). Basically this
-      # takes all characters and removes ASCII ones from the list, thus
-      # leaving you with Unicode.
-      unicode = any - ascii;
+      # DOCTYPES
+      #
+      # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
+      #
+      # Doctypes are lexed as a single token to make the parser's life
+      # easier: Racc can only match on a token's type, not on its value, so
+      # recognising a doctype from regular text tokens would be a pain.
+      #
+      # Lexing is relaxed compared to the W3 specification, which defines 4
+      # doctype formats with different rules. Oga doesn't really use the
+      # doctype for anything, so all formats share a single rule. The body is
+      # `(any - greater)*` rather than `any*`: a scanner takes the longest
+      # match, so `any*` would swallow everything up to the last ">" in the
+      # input instead of stopping at the doctype's own closing ">".
+      doctype = smaller whitespace* bang whitespace* 'DOCTYPE'i whitespace*
+        'HTML'i whitespace* (any - greater)* greater;

      main := |*
        whitespace => { t(:T_SPACE) };
        newline    => { t(:T_NEWLINE); advance_line };
-        smaller    => { t(:T_SMALLER) };
-        greater    => { t(:T_GREATER) };
-        slash      => { t(:T_SLASH) };
-        d_quote    => { t(:T_DQUOTE) };
-        s_quote    => { t(:T_SQUOTE) };
-        dash       => { t(:T_DASH) };
-        rbracket   => { t(:T_RBRACKET) };
-        lbracket   => { t(:T_LBRACKET) };
-        colon      => { t(:T_COLON) };
-        bang       => { t(:T_BANG) };
-        equals     => { t(:T_EQUALS) };
-        text       => { t(:T_TEXT) };
+
+        doctype  => { t(:T_DOCTYPE) };
+        smaller  => { t(:T_SMALLER) };
+        greater  => { t(:T_GREATER) };
+        slash    => { t(:T_SLASH) };
+        d_quote  => { t(:T_DQUOTE) };
+        s_quote  => { t(:T_SQUOTE) };
+        dash     => { t(:T_DASH) };
+        rbracket => { t(:T_RBRACKET) };
+        lbracket => { t(:T_LBRACKET) };
+        colon    => { t(:T_COLON) };
+        bang     => { t(:T_BANG) };
+        equals   => { t(:T_EQUALS) };
+        text     => { t(:T_TEXT) };
      *|;
    }%%
  end # Lexer
--- a/lib/oga/parser/html.y
+++ b/lib/oga/parser/html.y
@ -2,7 +2,7 @@ class Oga::Parser::HTML

 token T_SPACE T_NEWLINE T_SMALLER T_GREATER T_SLASH
 token T_DQUOTE T_SQUOTE T_DASH T_RBRACKET T_LBRACKET
-token T_COLON T_BANG T_EQUALS T_TEXT
+token T_COLON T_BANG T_EQUALS T_TEXT T_DOCTYPE

 options no_result_var

@ -19,8 +19,17 @@ rule

  expression
    : tag
+    | doctype
    ;

+  # Doctypes
+
+  doctype
+    : T_DOCTYPE { s(:doctype, val[0]) }
+    ;
+
+  # Generic HTML tags
+
  tag_start
    # <p>
    : T_SMALLER T_TEXT T_GREATER { val[1] }
@ -42,6 +51,16 @@ rule
  tag_body
    : T_TEXT
    ;
+
+  whitespaces
+    : whitespaces whitespace
+    | whitespace
+    ;
+
+  whitespace
+    : T_NEWLINE
+    | T_SPACE
+    ;
 end

 ---- inner
--- a/spec/oga/lexer_spec.rb
+++ b/spec/oga/lexer_spec.rb
@ -132,4 +132,18 @@ describe Oga::Lexer do
      ]
    end
  end
+
+  context 'doctypes' do
+    example 'lex the HTML5 doctype' do
+      lex('<!DOCTYPE html>').should == [
+        [:T_DOCTYPE, '<!DOCTYPE html>', 1, 1]
+      ]
+    end
+
+    example 'lex a random doctype' do
+      lex('<!DOCTYPE HTML PUBLIC "foobar" "baz">').should == [
+        [:T_DOCTYPE, '<!DOCTYPE HTML PUBLIC "foobar" "baz">', 1, 1]
+      ]
+    end
+  end
 end
--- a/spec/oga/parser/html/doctype_spec.rb
+++ b/spec/oga/parser/html/doctype_spec.rb
@ -0,0 +1,18 @@
+require 'spec_helper'
+
+describe Oga::Parser::HTML do
+  context 'doctypes' do
+    example 'parse the HTML5 doctype' do
+      doctype = '<!DOCTYPE html>'
+
+      parse_html(doctype).should == s( :document, s(:doctype, doctype))
+    end
+
+    example 'parse an HTML 4 strict doctype' do
+      doctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" ' \
+        '"http://www.w3.org/TR/html4/strict.dtd">'
+
+      parse_html(doctype).should == s(:document, s(:doctype, doctype))
+    end
+  end
+end
--- a/spec/support/parsing.rb
+++ b/spec/support/parsing.rb
@ -20,5 +20,15 @@ module Oga
    def lex(input)
      return Oga::Lexer.new.lex(input)
    end
+
+    ##
+    # Parses the given HTML and returns an AST.
+    #
+    # @param [String] input
+    # @return [Oga::AST::Node]
+    #
+    def parse_html(input)
+      return Oga::Parser::HTML.new.parse(input)
+    end
  end # ParsingHelpers
 end # Oga