Lexing support for unquoted HTML attribute values

This adds support for HTML such as:

    <a href=foo>HTML is a child of Satan itself</a>

Fixes #94
This commit is contained in:
Yorick Peterse 2015-04-15 01:23:46 +02:00
parent 23a441933a
commit afbb585812
5 changed files with 116 additions and 3 deletions

View File

@ -24,6 +24,7 @@ on `ts` and `te`) so the macro ignores this argument.
ID id_advance_line; ID id_advance_line;
ID id_literal_html_element_p; ID id_literal_html_element_p;
ID id_html;
%%machine c_lexer; %%machine c_lexer;
@ -75,6 +76,9 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
OgaLexerState *state; OgaLexerState *state;
int lines; int lines;
/* Whether or not HTML mode is enabled */
int html_p = rb_funcall(self, id_html, 0) == Qtrue;
/* Make sure that all data passed back to Ruby has the proper encoding. */ /* Make sure that all data passed back to Ruby has the proper encoding. */
rb_encoding *encoding = rb_enc_get(data_block); rb_encoding *encoding = rb_enc_get(data_block);
@ -181,6 +185,7 @@ void Init_liboga_xml_lexer()
id_advance_line = rb_intern("advance_line"); id_advance_line = rb_intern("advance_line");
id_literal_html_element_p = rb_intern("literal_html_element?"); id_literal_html_element_p = rb_intern("literal_html_element?");
id_html = rb_intern("html");
rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1); rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1);
rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0); rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0);

View File

@ -89,6 +89,8 @@ public class Lexer extends RubyObject
@JRubyMethod @JRubyMethod
public IRubyObject advance_native(ThreadContext context, RubyString rb_str) public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
{ {
Boolean html_p = this.callMethod(context, "html").isTrue();
Encoding encoding = rb_str.getEncoding(); Encoding encoding = rb_str.getEncoding();
byte[] data = rb_str.getBytes(); byte[] data = rb_str.getBytes();

View File

@ -52,6 +52,11 @@
if ( fc == '\n' ) lines++; if ( fc == '\n' ) lines++;
} }
# Hands the current character back to the calling machine: `fhold` keeps the
# scanner from advancing over the character that was just read, and `fret`
# returns to the machine that entered the current one using `fcall`. The
# caller therefore gets to process that character itself.
action hold_and_return {
fhold;
fret;
}
whitespace = [ \t]; whitespace = [ \t];
ident_char = [a-zA-Z0-9\-_]; ident_char = [a-zA-Z0-9\-_];
identifier = ident_char+; identifier = ident_char+;
@ -370,10 +375,42 @@
}; };
*|; *|;
# Characters that can be used for unquoted HTML attribute values.
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
# for more info.
#
# The pattern matches a run of one or more characters, none of which is one
# of the listed alternatives (quotes, a backtick, '=', '<', '>', whitespace
# or a newline). Any of those characters therefore ends an unquoted value.
html_unquoted_value = ^(
squote | dquote | '`' | '=' | '<' | '>' | whitespace | newline
)+;
# Machine used for processing HTML attribute values.
#
# Besides quoted values this machine also accepts unquoted values such as
# `href=foo`.
html_attribute_value := |*
# Quoted values are handled by the regular string actions.
squote => start_string_squote;
dquote => start_string_dquote;
# Unquoted attribute values are lexed as if they were single quoted
# strings.
html_unquoted_value => {
# Emit an opening quote, the matched text (between ts and te) as the
# string body, and a closing quote.
callback_simple(id_on_string_squote);
callback(id_on_string_body, data, encoding, ts, te);
callback_simple(id_on_string_squote);
};
# Anything else is not part of a value: give the character back to the
# machine that called this one.
any => hold_and_return;
*|;
# Machine used for processing XML attribute values.
#
# Only quoted values are recognised here; there is no rule for unquoted
# values in this machine.
xml_attribute_value := |*
squote => start_string_squote;
dquote => start_string_dquote;
# Any other character is given back to the machine that called this one.
any => hold_and_return;
*|;
# Machine used for processing the contents of an element's starting tag. # Machine used for processing the contents of an element's starting tag.
# This includes the name, namespace and attributes. # This includes the name, namespace and attributes.
element_head := |* element_head := |*
whitespace | '='; whitespace;
newline => { newline => {
callback_simple(id_advance_line); callback_simple(id_advance_line);
@ -389,8 +426,16 @@
}; };
# Attribute values. # Attribute values.
squote => start_string_squote; '=' => {
dquote => start_string_dquote; if ( html_p )
{
fcall html_attribute_value;
}
else
{
fcall xml_attribute_value;
}
};
# We're done with the open tag of the element. # We're done with the open tag of the element.
'>' => { '>' => {

View File

@ -0,0 +1,54 @@
require 'spec_helper'

# Specs for lexing HTML attributes whose values are not wrapped in quotes.
# Unquoted values are expected to produce the same token sequence as single
# quoted strings: T_STRING_SQUOTE, T_STRING_BODY, T_STRING_SQUOTE.
describe Oga::XML::Lexer do
  describe 'HTML attributes' do
    it 'lexes an attribute with an unquoted value' do
      lex_html('<a href=foo></a>').should == [
        [:T_ELEM_START, nil, 1],
        [:T_ELEM_NAME, 'a', 1],
        [:T_ATTR, 'href', 1],
        [:T_STRING_SQUOTE, nil, 1],
        [:T_STRING_BODY, 'foo', 1],
        [:T_STRING_SQUOTE, nil, 1],
        [:T_ELEM_END, nil, 1]
      ]
    end

    # A space is not part of an unquoted value: it ends the value, and the
    # text that follows ("bar") is lexed as a separate attribute.
    it 'lexes an attribute with an unquoted value followed by another attribute' do
      lex_html('<a href=foo bar></a>').should == [
        [:T_ELEM_START, nil, 1],
        [:T_ELEM_NAME, 'a', 1],
        [:T_ATTR, 'href', 1],
        [:T_STRING_SQUOTE, nil, 1],
        [:T_STRING_BODY, 'foo', 1],
        [:T_STRING_SQUOTE, nil, 1],
        [:T_ATTR, 'bar', 1],
        [:T_ELEM_END, nil, 1]
      ]
    end

    it 'lexes an attribute with an unquoted value containing an underscore' do
      lex_html('<a href=foo_bar></a>').should == [
        [:T_ELEM_START, nil, 1],
        [:T_ELEM_NAME, 'a', 1],
        [:T_ATTR, 'href', 1],
        [:T_STRING_SQUOTE, nil, 1],
        [:T_STRING_BODY, 'foo_bar', 1],
        [:T_STRING_SQUOTE, nil, 1],
        [:T_ELEM_END, nil, 1]
      ]
    end

    it 'lexes an attribute with an unquoted value containing a dash' do
      lex_html('<a href=foo-bar></a>').should == [
        [:T_ELEM_START, nil, 1],
        [:T_ELEM_NAME, 'a', 1],
        [:T_ATTR, 'href', 1],
        [:T_STRING_SQUOTE, nil, 1],
        [:T_STRING_BODY, 'foo-bar', 1],
        [:T_STRING_SQUOTE, nil, 1],
        [:T_ELEM_END, nil, 1]
      ]
    end
  end
end

View File

@ -36,6 +36,13 @@ module Oga
return lex(StringIO.new(input), options) return lex(StringIO.new(input), options)
end end
##
# Lexes the given input with the lexer's `:html` option enabled.
#
# @see [#lex]
# @param input The input to lex, passed as-is to `Oga::XML::Lexer.new`.
# @return The result of calling `lex` on the newly created lexer.
#
def lex_html(input)
  html_lexer = Oga::XML::Lexer.new(input, :html => true)

  html_lexer.lex
end
## ##
# Lexes an XPath expression. # Lexes an XPath expression.
# #