From afbb5858122d5aece252b957b3988787ed76168f Mon Sep 17 00:00:00 2001
From: Yorick Peterse
Date: Wed, 15 Apr 2015 01:23:46 +0200
Subject: [PATCH] Lexing support for unquoted HTML attribute values

This adds support for HTML such as:

    <a href=foo></a>

HTML is a child of Satan itself

Fixes #94
---
 ext/c/lexer.rl                             |  5 ++
 ext/java/org/liboga/xml/Lexer.rl           |  2 +
 ext/ragel/base_lexer.rl                    | 51 ++++++++++++++++++--
 spec/oga/xml/lexer/html_attributes_spec.rb | 54 ++++++++++++++++++++++
 spec/support/parsing_helpers.rb            |  7 +++
 5 files changed, 116 insertions(+), 3 deletions(-)
 create mode 100644 spec/oga/xml/lexer/html_attributes_spec.rb

diff --git a/ext/c/lexer.rl b/ext/c/lexer.rl
index 8513c32..c1bff37 100644
--- a/ext/c/lexer.rl
+++ b/ext/c/lexer.rl
@@ -24,6 +24,7 @@ on `ts` and `te`) so the macro ignores this argument.
 
 ID id_advance_line;
 ID id_literal_html_element_p;
+ID id_html;
 
 %%machine c_lexer;
 
@@ -75,6 +76,9 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
     OgaLexerState *state;
     int lines;
 
+    /* Whether or not HTML mode is enabled */
+    int html_p = rb_funcall(self, id_html, 0) == Qtrue;
+
     /* Make sure that all data passed back to Ruby has the proper encoding. */
     rb_encoding *encoding = rb_enc_get(data_block);
 
@@ -181,6 +185,7 @@ void Init_liboga_xml_lexer()
 
     id_advance_line = rb_intern("advance_line");
     id_literal_html_element_p = rb_intern("literal_html_element?");
+    id_html = rb_intern("html");
 
     rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1);
     rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0);
diff --git a/ext/java/org/liboga/xml/Lexer.rl b/ext/java/org/liboga/xml/Lexer.rl
index e04679e..28ada04 100644
--- a/ext/java/org/liboga/xml/Lexer.rl
+++ b/ext/java/org/liboga/xml/Lexer.rl
@@ -89,6 +89,8 @@ public class Lexer extends RubyObject
     @JRubyMethod
     public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
     {
+        Boolean html_p = this.callMethod(context, "html").isTrue();
+
         Encoding encoding = rb_str.getEncoding();
 
         byte[] data = rb_str.getBytes();
diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl
index a27971e..8606e64 100644
--- a/ext/ragel/base_lexer.rl
+++ b/ext/ragel/base_lexer.rl
@@ -52,6 +52,11 @@
         if ( fc == '\n' ) lines++;
     }
 
+    action hold_and_return {
+        fhold;
+        fret;
+    }
+
     whitespace = [ \t];
     ident_char = [a-zA-Z0-9\-_];
     identifier = ident_char+;
@@ -370,10 +375,42 @@
         };
     *|;
 
+    # Characters that can be used for unquoted HTML attribute values.
+    # See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
+    # for more info.
+    html_unquoted_value = ^(
+        squote | dquote | '`' | '=' | '<' | '>' | whitespace | newline
+    )+;
+
+    # Machine used for processing HTML attribute values.
+    html_attribute_value := |*
+        squote => start_string_squote;
+        dquote => start_string_dquote;
+
+        # Unquoted attribute values are lexed as if they were single quoted
+        # strings.
+        html_unquoted_value => {
+            callback_simple(id_on_string_squote);
+
+            callback(id_on_string_body, data, encoding, ts, te);
+
+            callback_simple(id_on_string_squote);
+        };
+
+        any => hold_and_return;
+    *|;
+
+    # Machine used for processing XML attribute values.
+    xml_attribute_value := |*
+        squote => start_string_squote;
+        dquote => start_string_dquote;
+        any => hold_and_return;
+    *|;
+
     # Machine used for processing the contents of an element's starting tag.
     # This includes the name, namespace and attributes.
     element_head := |*
-        whitespace | '=';
+        whitespace;
 
         newline => {
             callback_simple(id_advance_line);
@@ -389,8 +426,16 @@
         };
 
         # Attribute values.
-        squote => start_string_squote;
-        dquote => start_string_dquote;
+        '=' => {
+            if ( html_p )
+            {
+                fcall html_attribute_value;
+            }
+            else
+            {
+                fcall xml_attribute_value;
+            }
+        };
 
         # We're done with the open tag of the element.
         '>' => {
diff --git a/spec/oga/xml/lexer/html_attributes_spec.rb b/spec/oga/xml/lexer/html_attributes_spec.rb
new file mode 100644
index 0000000..41b79d4
--- /dev/null
+++ b/spec/oga/xml/lexer/html_attributes_spec.rb
@@ -0,0 +1,54 @@
+require 'spec_helper'
+
+describe Oga::XML::Lexer do
+  describe 'HTML attributes' do
+    it 'lexes an attribute with an unquoted value' do
+      lex_html('<a href=foo></a>').should == [
+        [:T_ELEM_START, nil, 1],
+        [:T_ELEM_NAME, 'a', 1],
+        [:T_ATTR, 'href', 1],
+        [:T_STRING_SQUOTE, nil, 1],
+        [:T_STRING_BODY, 'foo', 1],
+        [:T_STRING_SQUOTE, nil, 1],
+        [:T_ELEM_END, nil, 1]
+      ]
+    end
+
+    it 'lexes an attribute with an unquoted value containing a space' do
+      lex_html('<a href=foo bar></a>').should == [
+        [:T_ELEM_START, nil, 1],
+        [:T_ELEM_NAME, 'a', 1],
+        [:T_ATTR, 'href', 1],
+        [:T_STRING_SQUOTE, nil, 1],
+        [:T_STRING_BODY, 'foo', 1],
+        [:T_STRING_SQUOTE, nil, 1],
+        [:T_ATTR, 'bar', 1],
+        [:T_ELEM_END, nil, 1]
+      ]
+    end
+
+    it 'lexes an attribute with an unquoted value containing an underscore' do
+      lex_html('<a href=foo_bar></a>').should == [
+        [:T_ELEM_START, nil, 1],
+        [:T_ELEM_NAME, 'a', 1],
+        [:T_ATTR, 'href', 1],
+        [:T_STRING_SQUOTE, nil, 1],
+        [:T_STRING_BODY, 'foo_bar', 1],
+        [:T_STRING_SQUOTE, nil, 1],
+        [:T_ELEM_END, nil, 1]
+      ]
+    end
+
+    it 'lexes an attribute with an unquoted value containing a dash' do
+      lex_html('<a href=foo-bar></a>').should == [
+        [:T_ELEM_START, nil, 1],
+        [:T_ELEM_NAME, 'a', 1],
+        [:T_ATTR, 'href', 1],
+        [:T_STRING_SQUOTE, nil, 1],
+        [:T_STRING_BODY, 'foo-bar', 1],
+        [:T_STRING_SQUOTE, nil, 1],
+        [:T_ELEM_END, nil, 1]
+      ]
+    end
+  end
+end
diff --git a/spec/support/parsing_helpers.rb b/spec/support/parsing_helpers.rb
index a570dee..36f47e6 100644
--- a/spec/support/parsing_helpers.rb
+++ b/spec/support/parsing_helpers.rb
@@ -36,6 +36,13 @@ module Oga
       return lex(StringIO.new(input), options)
     end
 
+    ##
+    # @see [#lex]
+    #
+    def lex_html(input)
+      return Oga::XML::Lexer.new(input, :html => true).lex
+    end
+
     ##
     # Lexes an XPath expression.
     #