diff --git a/ext/c/lexer.rl b/ext/c/lexer.rl
index 8513c32..c1bff37 100644
--- a/ext/c/lexer.rl
+++ b/ext/c/lexer.rl
@@ -24,6 +24,7 @@ on `ts` and `te`) so the macro ignores this argument.
ID id_advance_line;
ID id_literal_html_element_p;
+ID id_html;
%%machine c_lexer;
@@ -75,6 +76,9 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
OgaLexerState *state;
int lines;
+ /* Whether or not HTML mode is enabled; any value other than nil/false counts. */
+ int html_p = RTEST(rb_funcall(self, id_html, 0));
+
/* Make sure that all data passed back to Ruby has the proper encoding. */
rb_encoding *encoding = rb_enc_get(data_block);
@@ -181,6 +185,7 @@ void Init_liboga_xml_lexer()
id_advance_line = rb_intern("advance_line");
id_literal_html_element_p = rb_intern("literal_html_element?");
+ id_html = rb_intern("html");
rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1);
rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0);
diff --git a/ext/java/org/liboga/xml/Lexer.rl b/ext/java/org/liboga/xml/Lexer.rl
index e04679e..28ada04 100644
--- a/ext/java/org/liboga/xml/Lexer.rl
+++ b/ext/java/org/liboga/xml/Lexer.rl
@@ -89,6 +89,8 @@ public class Lexer extends RubyObject
@JRubyMethod
public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
{
+ boolean html_p = this.callMethod(context, "html").isTrue(); // Whether or not HTML mode is enabled.
+
Encoding encoding = rb_str.getEncoding();
byte[] data = rb_str.getBytes();
diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl
index a27971e..8606e64 100644
--- a/ext/ragel/base_lexer.rl
+++ b/ext/ragel/base_lexer.rl
@@ -52,6 +52,11 @@
if ( fc == '\n' ) lines++;
}
+ action hold_and_return {
+ fhold;
+ fret;
+ } # Holds the current character (fhold), then returns to the calling machine (fret).
+
whitespace = [ \t];
ident_char = [a-zA-Z0-9\-_];
identifier = ident_char+;
@@ -370,10 +375,42 @@
};
*|;
+ # Characters that can be used for unquoted HTML attribute values: one or more
+ # characters, none of them a quote, '`', '=', '<', '>', whitespace or newline.
+ # See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
+ html_unquoted_value = ^(
+ squote | dquote | '`' | '=' | '<' | '>' | whitespace | newline
+ )+;
+
+ # Machine used for processing HTML attribute values (entered after '=' in HTML mode).
+ html_attribute_value := |*
+ squote => start_string_squote;
+ dquote => start_string_dquote;
+
+ # Unquoted attribute values are lexed as if they were single quoted
+ # strings: the body callback is wrapped in two squote callbacks.
+ html_unquoted_value => {
+ callback_simple(id_on_string_squote);
+
+ callback(id_on_string_body, data, encoding, ts, te);
+
+ callback_simple(id_on_string_squote);
+ };
+
+ any => hold_and_return; # Not a value: hand the character back to the calling machine.
+ *|;
+
+ # Machine used for processing XML attribute values (quoted strings only).
+ xml_attribute_value := |*
+ squote => start_string_squote;
+ dquote => start_string_dquote;
+ any => hold_and_return; # Anything else is handed back to the calling machine.
+ *|;
+
# Machine used for processing the contents of an element's starting tag.
# This includes the name, namespace and attributes.
element_head := |*
- whitespace | '=';
+ whitespace;
newline => {
callback_simple(id_advance_line);
@@ -389,8 +426,16 @@
};
# Attribute values.
- squote => start_string_squote;
- dquote => start_string_dquote;
+ '=' => {
+ if ( html_p )
+ {
+ fcall html_attribute_value;
+ }
+ else
+ {
+ fcall xml_attribute_value;
+ }
+ };
# We're done with the open tag of the element.
'>' => {
diff --git a/spec/oga/xml/lexer/html_attributes_spec.rb b/spec/oga/xml/lexer/html_attributes_spec.rb
new file mode 100644
index 0000000..41b79d4
--- /dev/null
+++ b/spec/oga/xml/lexer/html_attributes_spec.rb
@@ -0,0 +1,54 @@
+require 'spec_helper'
+
+describe Oga::XML::Lexer do
+ describe 'HTML attributes' do
+ it 'lexes an attribute with an unquoted value' do
+ lex_html('').should == [
+ [:T_ELEM_START, nil, 1],
+ [:T_ELEM_NAME, 'a', 1],
+ [:T_ATTR, 'href', 1],
+ [:T_STRING_SQUOTE, nil, 1],
+ [:T_STRING_BODY, 'foo', 1],
+ [:T_STRING_SQUOTE, nil, 1],
+ [:T_ELEM_END, nil, 1]
+ ]
+ end
+
+ it 'lexes an attribute with an unquoted value containing a space' do
+ lex_html('').should == [
+ [:T_ELEM_START, nil, 1],
+ [:T_ELEM_NAME, 'a', 1],
+ [:T_ATTR, 'href', 1],
+ [:T_STRING_SQUOTE, nil, 1],
+ [:T_STRING_BODY, 'foo', 1],
+ [:T_STRING_SQUOTE, nil, 1],
+ [:T_ATTR, 'bar', 1],
+ [:T_ELEM_END, nil, 1]
+ ]
+ end
+
+ it 'lexes an attribute with an unquoted value containing an underscore' do
+ lex_html('').should == [
+ [:T_ELEM_START, nil, 1],
+ [:T_ELEM_NAME, 'a', 1],
+ [:T_ATTR, 'href', 1],
+ [:T_STRING_SQUOTE, nil, 1],
+ [:T_STRING_BODY, 'foo_bar', 1],
+ [:T_STRING_SQUOTE, nil, 1],
+ [:T_ELEM_END, nil, 1]
+ ]
+ end
+
+ it 'lexes an attribute with an unquoted value containing a dash' do
+ lex_html('').should == [
+ [:T_ELEM_START, nil, 1],
+ [:T_ELEM_NAME, 'a', 1],
+ [:T_ATTR, 'href', 1],
+ [:T_STRING_SQUOTE, nil, 1],
+ [:T_STRING_BODY, 'foo-bar', 1],
+ [:T_STRING_SQUOTE, nil, 1],
+ [:T_ELEM_END, nil, 1]
+ ]
+ end
+ end
+end
diff --git a/spec/support/parsing_helpers.rb b/spec/support/parsing_helpers.rb
index a570dee..36f47e6 100644
--- a/spec/support/parsing_helpers.rb
+++ b/spec/support/parsing_helpers.rb
@@ -36,6 +36,13 @@ module Oga
return lex(StringIO.new(input), options)
end
+ ##
+ # Lexes `input` using a lexer created with the `:html => true` option.
+ # @see [#lex]
+ def lex_html(input)
+ return Oga::XML::Lexer.new(input, :html => true).lex
+ end
+
##
# Lexes an XPath expression.
#