Basic lexing + parsing of doctypes.

We're doing these the lazy way: instead of writing separate patterns/rules for each of the 4 different doctype formats, all of them are matched by a single relaxed rule.
2014-02-27 01:27:43 +01:00 · 2014-02-27 01:27:43 +01:00 · 2c82f88f6c
parent d7d20b4c23
commit 2c82f88f6c
5 changed files with 94 additions and 20 deletions
--- a/lib/oga/lexer.rl
+++ b/lib/oga/lexer.rl
@ -83,35 +83,48 @@ module Oga
      lbracket = '[';
      rbracket = ']';
-      s_quote  = "'";
+      s_quote = "'";
-      d_quote  = '"';
+      d_quote = '"';
      # FIXME: there really should be a better way of doing this.
      text = (any - s_quote - d_quote - equals - bang - slash -
        greater - smaller - whitespace - newline - colon - dash -
        lbracket - rbracket)+;
-      # Unicode characters, taken from whitequark's wonderful parser library.
+      # DOCTYPES
-      # (I honestly need to buy that dude a beer or 100). Basically this
+      #
-      # takes all characters and removes ASCII ones from the list, thus
+      # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
-      # leaving you with Unicode.
+      #
-      unicode = any - ascii;
+      # Doctypes are treated with some extra care on lexer level to make the
      # parser's life easier. If they were treated as regular text it would be
      # a pain to specify a proper doctype in Racc since it can't match on a
      # token's value (only on its type).
      #
      # Doctype parsing is also relaxed compared to the W3 specification. For
      # example, the specification defines 4 doctype formats each having
      # different rules. Because Oga doesn't really use the doctype for
      # anything we'll just slap all the formats into a single rule. Easy
      # enough.
      # Matches "<", "!", a case-insensitive "DOCTYPE" and "HTML" (each
      # optionally separated by whitespace) and then everything up to the
      # first ">".
      #
      # The remainder is matched with `(any - greater)*` instead of `any*`:
      # the scanner below always takes the longest match, so an unrestricted
      # `any*` would keep going until the LAST ">" in the input and swallow
      # whatever markup follows the doctype.
      doctype = smaller whitespace* bang whitespace* 'DOCTYPE'i whitespace*
        'HTML'i whitespace* (any - greater)* greater;
      main := |*
        whitespace => { t(:T_SPACE) };
        newline    => { t(:T_NEWLINE); advance_line };
-        smaller    => { t(:T_SMALLER) };
+
-        greater    => { t(:T_GREATER) };
+        doctype  => { t(:T_DOCTYPE) };
-        slash      => { t(:T_SLASH) };
+        smaller  => { t(:T_SMALLER) };
-        d_quote    => { t(:T_DQUOTE) };
+        greater  => { t(:T_GREATER) };
-        s_quote    => { t(:T_SQUOTE) };
+        slash    => { t(:T_SLASH) };
-        dash       => { t(:T_DASH) };
+        d_quote  => { t(:T_DQUOTE) };
-        rbracket   => { t(:T_RBRACKET) };
+        s_quote  => { t(:T_SQUOTE) };
-        lbracket   => { t(:T_LBRACKET) };
+        dash     => { t(:T_DASH) };
-        colon      => { t(:T_COLON) };
+        rbracket => { t(:T_RBRACKET) };
-        bang       => { t(:T_BANG) };
+        lbracket => { t(:T_LBRACKET) };
-        equals     => { t(:T_EQUALS) };
+        colon    => { t(:T_COLON) };
-        text       => { t(:T_TEXT) };
+        bang     => { t(:T_BANG) };
        equals   => { t(:T_EQUALS) };
        text     => { t(:T_TEXT) };
      *|;
    }%%
  end # Lexer
--- a/lib/oga/parser/html.y
+++ b/lib/oga/parser/html.y
@ -2,7 +2,7 @@ class Oga::Parser::HTML
 token T_SPACE T_NEWLINE T_SMALLER T_GREATER T_SLASH
 token T_DQUOTE T_SQUOTE T_DASH T_RBRACKET T_LBRACKET
-token T_COLON T_BANG T_EQUALS T_TEXT
+token T_COLON T_BANG T_EQUALS T_TEXT T_DOCTYPE
 options no_result_var
@ -19,8 +19,17 @@ rule
  # A single expression is either a tag or a doctype.
  expression
    : tag
    | doctype
    ;
  # Doctypes
  #
  # A doctype is a single T_DOCTYPE token. The token's value (val[0]) is
  # passed unchanged to s(:doctype, ...).
  doctype
    : T_DOCTYPE { s(:doctype, val[0]) }
    ;
  # Generic HTML tags
  tag_start
    # <p>
    : T_SMALLER T_TEXT T_GREATER { val[1] }
@ -42,6 +51,16 @@ rule
  # tag_body matches a single T_TEXT token.
  tag_body
    : T_TEXT
    ;
  # One or more `whitespace` items (left-recursive list).
  whitespaces
    : whitespaces whitespace
    | whitespace
    ;
  # A single whitespace item: either a T_NEWLINE or a T_SPACE token.
  whitespace
    : T_NEWLINE
    | T_SPACE
    ;
 end
 ---- inner
--- a/spec/oga/lexer_spec.rb
+++ b/spec/oga/lexer_spec.rb
@ -132,4 +132,18 @@ describe Oga::Lexer do
      ]
    end
  end
  context 'doctypes' do
    # In both examples the whole doctype should come back as a single
    # T_DOCTYPE token holding the input unchanged, at line 1, column 1.
    example 'lex the HTML5 doctype' do
      input = '<!DOCTYPE html>'

      lex(input).should == [[:T_DOCTYPE, input, 1, 1]]
    end

    example 'lex a random doctype' do
      input = '<!DOCTYPE HTML PUBLIC "foobar" "baz">'

      lex(input).should == [[:T_DOCTYPE, input, 1, 1]]
    end
  end
 end
--- a/spec/oga/parser/html/doctype_spec.rb
+++ b/spec/oga/parser/html/doctype_spec.rb
@ -0,0 +1,18 @@
 require 'spec_helper'
 describe Oga::Parser::HTML do
  context 'doctypes' do
    # Each example expects a document node wrapping a doctype node that
    # holds the doctype source exactly as given.
    example 'parse the HTML5 doctype' do
      html5 = '<!DOCTYPE html>'
      ast   = parse_html(html5)

      ast.should == s(:document, s(:doctype, html5))
    end

    example 'parse an HTML 4 strict doctype' do
      html4 = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" ' \
        '"http://www.w3.org/TR/html4/strict.dtd">'
      ast   = parse_html(html4)

      ast.should == s(:document, s(:doctype, html4))
    end
  end
 end
--- a/spec/support/parsing.rb
+++ b/spec/support/parsing.rb
@ -20,5 +20,15 @@ module Oga
    def lex(input)
      # Build a new lexer for every call and hand back whatever it produces.
      lexer = Oga::Lexer.new

      lexer.lex(input)
    end
    ##
    # Feeds the given HTML to a new `Oga::Parser::HTML` instance and returns
    # the result of its `parse` method.
    #
    # @param [String] input
    # @return [Oga::AST::Node]
    #
    def parse_html(input)
      parser = Oga::Parser::HTML.new

      parser.parse(input)
    end
  end # ParsingHelpers
 end # Oga