From afbb5858122d5aece252b957b3988787ed76168f Mon Sep 17 00:00:00 2001
From: Yorick Peterse <yorickpeterse@gmail.com>
Date: Wed, 15 Apr 2015 01:23:46 +0200
Subject: [PATCH] Lexing support for unquoted HTML attribute values

This adds support for HTML such as:

    <a href=foo>HTML is a child of Satan itself</a>

Fixes #94
---
 ext/c/lexer.rl                             |  5 ++
 ext/java/org/liboga/xml/Lexer.rl           |  2 +
 ext/ragel/base_lexer.rl                    | 51 ++++++++++++++++++--
 spec/oga/xml/lexer/html_attributes_spec.rb | 54 ++++++++++++++++++++++
 spec/support/parsing_helpers.rb            |  7 +++
 5 files changed, 116 insertions(+), 3 deletions(-)
 create mode 100644 spec/oga/xml/lexer/html_attributes_spec.rb
diff --git a/ext/c/lexer.rl b/ext/c/lexer.rl
index 8513c32..c1bff37 100644
--- a/ext/c/lexer.rl
+++ b/ext/c/lexer.rl
@@ -24,6 +24,7 @@ on `ts` and `te`) so the macro ignores this argument.
 
 ID id_advance_line;
 ID id_literal_html_element_p;
+ID id_html;
 
 %%machine c_lexer;
 
@@ -75,6 +76,9 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
     OgaLexerState *state;
     int lines;
 
+    /* Whether or not HTML mode is enabled */
+    int html_p = rb_funcall(self, id_html, 0) == Qtrue;
+
     /* Make sure that all data passed back to Ruby has the proper encoding. */
     rb_encoding *encoding = rb_enc_get(data_block);
 
@@ -181,6 +185,7 @@ void Init_liboga_xml_lexer()
 
     id_advance_line           = rb_intern("advance_line");
     id_literal_html_element_p = rb_intern("literal_html_element?");
+    id_html                   = rb_intern("html");
 
     rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1);
     rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0);
diff --git a/ext/java/org/liboga/xml/Lexer.rl b/ext/java/org/liboga/xml/Lexer.rl
index e04679e..28ada04 100644
--- a/ext/java/org/liboga/xml/Lexer.rl
+++ b/ext/java/org/liboga/xml/Lexer.rl
@@ -89,6 +89,8 @@ public class Lexer extends RubyObject
     @JRubyMethod
     public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
     {
+        Boolean html_p = this.callMethod(context, "html").isTrue();
+
         Encoding encoding = rb_str.getEncoding();
 
         byte[] data = rb_str.getBytes();
diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl
index a27971e..8606e64 100644
--- a/ext/ragel/base_lexer.rl
+++ b/ext/ragel/base_lexer.rl
@@ -52,6 +52,11 @@
         if ( fc == '\n' ) lines++;
     }
 
+    action hold_and_return {
+        fhold; /* put the current character back so it gets scanned again */
+        fret; /* return to the machine that called into this one */
+    }
+
     whitespace = [ \t];
     ident_char = [a-zA-Z0-9\-_];
     identifier = ident_char+;
@@ -370,10 +375,42 @@
         };
     *|;
 
+    # Characters allowed in unquoted HTML attribute values: anything other than
+    # quotes, '`', '=', '<', '>', whitespace and newlines. See
+    # https://html.spec.whatwg.org/multipage/syntax.html#unquoted for more info.
+    html_unquoted_value = ^(
+        squote | dquote | '`' | '=' | '<' | '>' | whitespace | newline
+    )+;
+
+    # Machine for HTML attribute values: a quoted string or an unquoted value.
+    html_attribute_value := |*
+        squote => start_string_squote;
+        dquote => start_string_dquote;
+
+        # An unquoted value is reported through the single quote callbacks
+        # (quote, body, quote), exactly as if it had been single quoted.
+        html_unquoted_value => {
+            callback_simple(id_on_string_squote);
+
+            callback(id_on_string_body, data, encoding, ts, te);
+
+            callback_simple(id_on_string_squote);
+        };
+
+        any => hold_and_return;
+    *|;
+
+    # Machine for XML attribute values; only quoted strings are accepted here.
+    xml_attribute_value := |*
+        squote => start_string_squote;
+        dquote => start_string_dquote;
+        any    => hold_and_return;
+    *|;
+
     # Machine used for processing the contents of an element's starting tag.
     # This includes the name, namespace and attributes.
     element_head := |*
-        whitespace | '=';
+        whitespace;
 
         newline => {
             callback_simple(id_advance_line);
@@ -389,8 +426,16 @@
         };
 
         # Attribute values.
-        squote => start_string_squote;
-        dquote => start_string_dquote;
+        '=' => {
+            if ( html_p )
+            {
+                fcall html_attribute_value;
+            }
+            else
+            {
+                fcall xml_attribute_value;
+            }
+        };
 
         # We're done with the open tag of the element.
         '>' => {
diff --git a/spec/oga/xml/lexer/html_attributes_spec.rb b/spec/oga/xml/lexer/html_attributes_spec.rb
new file mode 100644
index 0000000..41b79d4
--- /dev/null
+++ b/spec/oga/xml/lexer/html_attributes_spec.rb
@@ -0,0 +1,54 @@
+require 'spec_helper'
+
+describe Oga::XML::Lexer do
+  describe 'HTML attributes' do
+    it 'lexes an attribute with an unquoted value' do
+      lex_html('<a href=foo></a>').should == [
+        [:T_ELEM_START, nil, 1],
+        [:T_ELEM_NAME, 'a', 1],
+        [:T_ATTR, 'href', 1],
+        [:T_STRING_SQUOTE, nil, 1],
+        [:T_STRING_BODY, 'foo', 1],
+        [:T_STRING_SQUOTE, nil, 1],
+        [:T_ELEM_END, nil, 1]
+      ]
+    end
+
+    it 'lexes an attribute with an unquoted value containing a space' do
+      lex_html('<a href=foo bar></a>').should == [
+        [:T_ELEM_START, nil, 1],
+        [:T_ELEM_NAME, 'a', 1],
+        [:T_ATTR, 'href', 1],
+        [:T_STRING_SQUOTE, nil, 1],
+        [:T_STRING_BODY, 'foo', 1],
+        [:T_STRING_SQUOTE, nil, 1],
+        [:T_ATTR, 'bar', 1],
+        [:T_ELEM_END, nil, 1]
+      ]
+    end
+
+    it 'lexes an attribute with an unquoted value containing an underscore' do
+      lex_html('<a href=foo_bar></a>').should == [
+        [:T_ELEM_START, nil, 1],
+        [:T_ELEM_NAME, 'a', 1],
+        [:T_ATTR, 'href', 1],
+        [:T_STRING_SQUOTE, nil, 1],
+        [:T_STRING_BODY, 'foo_bar', 1],
+        [:T_STRING_SQUOTE, nil, 1],
+        [:T_ELEM_END, nil, 1]
+      ]
+    end
+
+    it 'lexes an attribute with an unquoted value containing a dash' do
+      lex_html('<a href=foo-bar></a>').should == [
+        [:T_ELEM_START, nil, 1],
+        [:T_ELEM_NAME, 'a', 1],
+        [:T_ATTR, 'href', 1],
+        [:T_STRING_SQUOTE, nil, 1],
+        [:T_STRING_BODY, 'foo-bar', 1],
+        [:T_STRING_SQUOTE, nil, 1],
+        [:T_ELEM_END, nil, 1]
+      ]
+    end
+  end
+end
diff --git a/spec/support/parsing_helpers.rb b/spec/support/parsing_helpers.rb
index a570dee..36f47e6 100644
--- a/spec/support/parsing_helpers.rb
+++ b/spec/support/parsing_helpers.rb
@@ -36,6 +36,13 @@ module Oga
       return lex(StringIO.new(input), options)
     end
 
+    ##
+    # Lexes the input using a lexer with the `:html` option enabled.
+    # @see #lex
+    def lex_html(input)
+      return Oga::XML::Lexer.new(input, :html => true).lex
+    end
+
     ##
     # Lexes an XPath expression.
     #