From ca6f4220360963c89edce2cb03396a74a5f6204e Mon Sep 17 00:00:00 2001
From: Yorick Peterse <yorickpeterse@gmail.com>
Date: Fri, 28 Feb 2014 23:08:55 +0100
Subject: [PATCH] Lexing of doctypes.

This comes with various structural changes to the lexer as I'm slowly starting
to get the hang of Ragel. Ragel is a beast but damn it's an awesome piece of
software.

Note that the doctype public/system IDs are lexed as T_STRING. The parser will
figure out whether a ID is a public or system ID based on the order.

This fixes #1
---
 lib/oga/lexer.rl               | 120 +++++++++++++++++++++++++++------
 spec/oga/lexer/doctype_spec.rb |  21 +++++-
 2 files changed, 116 insertions(+), 25 deletions(-)
diff --git a/lib/oga/lexer.rl b/lib/oga/lexer.rl
index 73491bb..4ab167b 100644
--- a/lib/oga/lexer.rl
+++ b/lib/oga/lexer.rl
@@ -27,6 +27,8 @@ module Oga
       @ts     = nil
       @te     = nil
       @tokens = []
+      @stack  = []
+      @top    = 0
     end
 
     def lex(data)
@@ -73,6 +75,13 @@ module Oga
       @tokens << token
     end
 
+    def emit_string_buffer
+      add_token(:T_STRING, @string_buffer)
+      advance_column
+
+      @string_buffer = nil
+    end
+
     %%{
       # Use instance variables for `ts` and friends.
       access @;
@@ -80,22 +89,85 @@ module Oga
       newline    = '\n' | '\r\n';
       whitespace = [ \t];
 
+      action emit_space {
+        t(:T_SPACE)
+      }
+
+      action emit_newline {
+        t(:T_NEWLINE)
+        advance_line
+      }
+
+      # String processing
+      #
+      # These actions/definitions can be used to process single and/or double
+      # quoted strings (e.g. for tag attribute values).
+      #
+      # The string_dquote and string_squote machines should not be used
+      # directly, instead the corresponding actions should be used.
+      #
+      dquote = '"';
+      squote = "'";
+
+      action buffer_string {
+        @string_buffer ||= ''
+        @string_buffer << text
+      }
+
+      action string_dquote {
+        advance_column
+        fcall string_dquote;
+      }
+
+      action string_squote {
+        advance_column
+        fcall string_squote;
+      }
+
+      string_dquote := |*
+        ^dquote => buffer_string;
+        dquote  => {
+          emit_string_buffer
+          fret;
+        };
+      *|;
+
+      string_squote := |*
+        ^squote => buffer_string;
+        squote  => {
+          emit_string_buffer
+          fret;
+        };
+      *|;
+
       # DOCTYPES
       #
       # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
       #
-      # Doctypes are treated with some extra care on lexer level to make the
-      # parser's life easier. If they were treated as regular text it would be
-      # a pain to specify a proper doctype in Racc since it can't match on a
-      # token's value (only on its type).
+      # These rules support the 3 flavours of doctypes:
       #
-      # Doctype parsing is also relaxed compared to the W3 specification. For
-      # example, the specification defines 4 doctype formats each having
-      # different rules. Because Oga doesn't really use the doctype for
-      # anything we'll just slap all the formats into a single rule. Easy
-      # enough.
-      doctype = '<' whitespace* '!' whitespace* 'DOCTYPE'i whitespace*
-        'HTML'i whitespace* any* '>';
+      # 1. Normal doctypes, as introduced in the HTML5 specification.
+      # 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
+      # 3. Legacy doctypes
+      #
+      doctype_start = '<!DOCTYPE'i whitespace+ 'HTML'i;
+
+      doctype := |*
+        'PUBLIC' | 'SYSTEM' => { t(:T_DOCTYPE_TYPE) };
+
+        # Lex the public/system IDs as regular strings.
+        dquote => string_dquote;
+        squote => string_squote;
+
+        # Whitespace inside doctypes is ignored since there's no point in
+        # including it.
+        whitespace => { advance_column };
+
+        '>' => {
+          t(:T_DOCTYPE_END)
+          fgoto main;
+        };
+      *|;
 
       # CDATA
       #
@@ -111,12 +183,6 @@ module Oga
       cdata_end   = ']]>';
 
       cdata := |*
-        cdata_start => {
-          t(:T_CDATA_START)
-
-          @cdata_buffer = ''
-        };
-
         cdata_end => {
           add_token(:T_TEXT, @cdata_buffer)
           @cdata_buffer = nil
@@ -132,13 +198,23 @@ module Oga
       *|;
 
       main := |*
-        whitespace => { t(:T_SPACE) };
-        newline    => { t(:T_NEWLINE); advance_line };
+        whitespace => emit_space;
+        newline    => emit_newline;
 
-        doctype  => { t(:T_DOCTYPE) };
+        doctype_start => {
+          t(:T_DOCTYPE_START)
 
-        # Jump to the cdata machine right away without processing anything.
-        cdata_start >{ fhold; fgoto cdata; };
+          fgoto doctype;
+        };
+
+        # @cdata_buffer is used to store the content of the CDATA tag.
+        cdata_start => {
+          t(:T_CDATA_START)
+
+          @cdata_buffer = ''
+
+          fgoto cdata;
+        };
 
         # General rules and actions.
         '<' => { t(:T_SMALLER) };
diff --git a/spec/oga/lexer/doctype_spec.rb b/spec/oga/lexer/doctype_spec.rb
index 7e430ad..67ea0bb 100644
--- a/spec/oga/lexer/doctype_spec.rb
+++ b/spec/oga/lexer/doctype_spec.rb
@@ -4,13 +4,28 @@ describe Oga::Lexer do
   context 'doctypes' do
     example 'lex the HTML5 doctype' do
       lex('<!DOCTYPE html>').should == [
-        [:T_DOCTYPE, '<!DOCTYPE html>', 1, 1]
+        [:T_DOCTYPE_START, '<!DOCTYPE html', 1, 1],
+        [:T_DOCTYPE_END, '>', 1, 15]
       ]
     end
 
-    example 'lex a random doctype' do
+    example 'lex a doctype with a public and system ID' do
       lex('<!DOCTYPE HTML PUBLIC "foobar" "baz">').should == [
-        [:T_DOCTYPE, '<!DOCTYPE HTML PUBLIC "foobar" "baz">', 1, 1]
+        [:T_DOCTYPE_START, '<!DOCTYPE HTML', 1, 1],
+        [:T_DOCTYPE_TYPE, 'PUBLIC', 1, 16],
+        [:T_STRING, 'foobar', 1, 24],
+        [:T_STRING, 'baz', 1, 33],
+        [:T_DOCTYPE_END, '>', 1, 37]
+      ]
+    end
+
+    example 'lex a doctype with a public and system ID using single quotes' do
+      lex("<!DOCTYPE HTML PUBLIC 'foobar' 'baz'>").should == [
+        [:T_DOCTYPE_START, '<!DOCTYPE HTML', 1, 1],
+        [:T_DOCTYPE_TYPE, 'PUBLIC', 1, 16],
+        [:T_STRING, 'foobar', 1, 24],
+        [:T_STRING, 'baz', 1, 33],
+        [:T_DOCTYPE_END, '>', 1, 37]
       ]
     end
   end