Lexing of doctypes.

This comes with various structural changes to the lexer as I'm slowly starting to get the hang of Ragel. Ragel is a beast but damn it's an awesome piece of software. Note that the doctype public/system IDs are lexed as T_STRING. The parser will figure out whether an ID is a public or system ID based on the order. This fixes #1.
2014-02-28 23:08:55 +01:00 · 2014-02-28 23:08:55 +01:00 · ca6f422036
parent 3c825afee0
commit ca6f422036
2 changed files with 116 additions and 25 deletions
--- a/lib/oga/lexer.rl
+++ b/lib/oga/lexer.rl
@ -27,6 +27,8 @@ module Oga
      @ts     = nil
      @te     = nil
      @tokens = []
      @stack  = []
      @top    = 0
    end
    def lex(data)
@ -73,6 +75,13 @@ module Oga
      @tokens << token
    end
    # Emits the buffered string contents as a single T_STRING token and
    # resets the buffer afterwards.
    #
    # The buffer is allocated lazily by the `buffer_string` action, so an
    # empty quoted string (e.g. `""`) reaches this method with a nil buffer.
    # In that case an empty String is emitted so that T_STRING tokens always
    # carry a String value instead of nil.
    #
    def emit_string_buffer
      add_token(:T_STRING, @string_buffer || '')

      # NOTE(review): presumably accounts for the closing quote, which is not
      # part of the token value - confirm against advance_column.
      advance_column

      @string_buffer = nil
    end
    %%{
      # Use instance variables for `ts` and friends.
      access @;
@ -80,22 +89,85 @@ module Oga
      newline    = '\n' | '\r\n';
      whitespace = [ \t];
      # Emits a T_SPACE token for the current match via the `t` helper.
      action emit_space {
        t(:T_SPACE)
      }
      # Emits a T_NEWLINE token for the current match and then calls
      # `advance_line` (presumably moving the lexer to the next line; the
      # helper is defined outside this section).
      action emit_newline {
        t(:T_NEWLINE)
        advance_line
      }
      # String processing
      #
      # These actions/definitions can be used to process single and/or double
      # quoted strings (e.g. for tag attribute values).
      #
      # The string_dquote and string_squote machines should not be used
      # directly, instead the corresponding actions should be used.
      #
      dquote = '"';
      squote = "'";
      # Appends the value returned by `text` (presumably the text of the
      # current match) to the string buffer, allocating the buffer on first
      # use. The buffer stays nil until this action has run at least once.
      action buffer_string {
        @string_buffer ||= ''
        @string_buffer << text
      }
      # Runs when an opening double quote is matched: calls advance_column
      # and then uses `fcall` to enter the string_dquote machine, which
      # returns to the calling machine with `fret` once the closing quote has
      # been processed.
      action string_dquote {
        advance_column
        fcall string_dquote;
      }
      # Single quote counterpart of the string_dquote action: calls
      # advance_column and then enters the string_squote machine via `fcall`.
      action string_squote {
        advance_column
        fcall string_squote;
      }
      # Scanner for the body of a double quoted string. Every character that
      # is not a double quote is appended to the string buffer. The closing
      # double quote emits the buffer as a T_STRING token and returns to the
      # machine that entered this one.
      string_dquote := |*
        ^dquote => buffer_string;
        dquote  => {
          emit_string_buffer
          fret;
        };
      *|;
      # Scanner for the body of a single quoted string. Works the same way as
      # string_dquote, except that a single quote terminates the string.
      string_squote := |*
        ^squote => buffer_string;
        squote  => {
          emit_string_buffer
          fret;
        };
      *|;
      # DOCTYPES
      #
      # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
      #
-      # Doctypes are treated with some extra care on lexer level to make the
+      # These rules support the 3 flavours of doctypes:
      # parser's life easier. If they were treated as regular text it would be
      # a pain to specify a proper doctype in Racc since it can't match on a
      # token's value (only on its type).
      #
-      # Doctype parsing is also relaxed compared to the W3 specification. For
+      # 1. Normal doctypes, as introduced in the HTML5 specification.
-      # example, the specification defines 4 doctype formats each having
+      # 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
-      # different rules. Because Oga doesn't really use the doctype for
+      # 3. Legacy doctypes
-      # anything we'll just slap all the formats into a single rule. Easy
+      #
-      # enough.
+      doctype_start = '<!DOCTYPE'i whitespace+ 'HTML'i;
-      doctype = '<' whitespace* '!' whitespace* 'DOCTYPE'i whitespace*
+
-        'HTML'i whitespace* any* '>';
+      doctype := |*
        # The PUBLIC/SYSTEM keywords are matched case-insensitively, in line
        # with doctype_start and the doctype syntax referenced above. The
        # token value is the keyword exactly as written in the input.
        'PUBLIC'i | 'SYSTEM'i => { t(:T_DOCTYPE_TYPE) };
        # Lex the public/system IDs as regular strings.
        dquote => string_dquote;
        squote => string_squote;
        # Whitespace inside doctypes is ignored since there's no point in
        # including it.
        whitespace => { advance_column };
        # The closing bracket ends the doctype; lexing continues in the main
        # machine.
        '>' => {
          t(:T_DOCTYPE_END)
          fgoto main;
        };
      *|;
      # CDATA
      #
@ -111,12 +183,6 @@ module Oga
      cdata_end   = ']]>';
      cdata := |*
        cdata_start => {
          t(:T_CDATA_START)
          @cdata_buffer = ''
        };
        cdata_end => {
          add_token(:T_TEXT, @cdata_buffer)
          @cdata_buffer = nil
@ -132,13 +198,23 @@ module Oga
      *|;
      main := |*
-        whitespace => { t(:T_SPACE) };
+        whitespace => emit_space;
-        newline    => { t(:T_NEWLINE); advance_line };
+        newline    => emit_newline;
-        doctype  => { t(:T_DOCTYPE) };
+        doctype_start => {
          t(:T_DOCTYPE_START)
-        # Jump to the cdata machine right away without processing anything.
+          fgoto doctype;
-        cdata_start >{ fhold; fgoto cdata; };
+        };
        # @cdata_buffer is used to store the content of the CDATA tag.
        cdata_start => {
          t(:T_CDATA_START)
          @cdata_buffer = ''
          fgoto cdata;
        };
        # General rules and actions.
        '<' => { t(:T_SMALLER) };
--- a/spec/oga/lexer/doctype_spec.rb
+++ b/spec/oga/lexer/doctype_spec.rb
@ -4,13 +4,28 @@ describe Oga::Lexer do
  context 'doctypes' do
    example 'lex the HTML5 doctype' do
      lex('<!DOCTYPE html>').should == [
-        [:T_DOCTYPE, '<!DOCTYPE html>', 1, 1]
+        [:T_DOCTYPE_START, '<!DOCTYPE html', 1, 1],
        [:T_DOCTYPE_END, '>', 1, 15]
      ]
    end
-    example 'lex a random doctype' do
+    example 'lex a doctype with a public and system ID' do
      lex('<!DOCTYPE HTML PUBLIC "foobar" "baz">').should == [
-        [:T_DOCTYPE, '<!DOCTYPE HTML PUBLIC "foobar" "baz">', 1, 1]
+        [:T_DOCTYPE_START, '<!DOCTYPE HTML', 1, 1],
        [:T_DOCTYPE_TYPE, 'PUBLIC', 1, 16],
        [:T_STRING, 'foobar', 1, 24],
        [:T_STRING, 'baz', 1, 33],
        [:T_DOCTYPE_END, '>', 1, 37]
      ]
    end
    example 'lex a doctype with a public and system ID using single quotes' do
      lex("<!DOCTYPE HTML PUBLIC 'foobar' 'baz'>").should == [
        [:T_DOCTYPE_START, '<!DOCTYPE HTML', 1, 1],
        [:T_DOCTYPE_TYPE, 'PUBLIC', 1, 16],
        [:T_STRING, 'foobar', 1, 24],
        [:T_STRING, 'baz', 1, 33],
        [:T_DOCTYPE_END, '>', 1, 37]
      ]
    end
  end