Lexing support for unquoted HTML attribute values
This adds support for HTML such as: <a href=foo>HTML is a child of Satan itself</a> Fixes #94
This commit is contained in:
parent
23a441933a
commit
afbb585812
|
@ -24,6 +24,7 @@ on `ts` and `te`) so the macro ignores this argument.
|
||||||
|
|
||||||
ID id_advance_line;
|
ID id_advance_line;
|
||||||
ID id_literal_html_element_p;
|
ID id_literal_html_element_p;
|
||||||
|
ID id_html;
|
||||||
|
|
||||||
%%machine c_lexer;
|
%%machine c_lexer;
|
||||||
|
|
||||||
|
@ -75,6 +76,9 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
|
||||||
OgaLexerState *state;
|
OgaLexerState *state;
|
||||||
int lines;
|
int lines;
|
||||||
|
|
||||||
|
/* Whether or not HTML mode is enabled */
|
||||||
|
int html_p = rb_funcall(self, id_html, 0) == Qtrue;
|
||||||
|
|
||||||
/* Make sure that all data passed back to Ruby has the proper encoding. */
|
/* Make sure that all data passed back to Ruby has the proper encoding. */
|
||||||
rb_encoding *encoding = rb_enc_get(data_block);
|
rb_encoding *encoding = rb_enc_get(data_block);
|
||||||
|
|
||||||
|
@ -181,6 +185,7 @@ void Init_liboga_xml_lexer()
|
||||||
|
|
||||||
id_advance_line = rb_intern("advance_line");
|
id_advance_line = rb_intern("advance_line");
|
||||||
id_literal_html_element_p = rb_intern("literal_html_element?");
|
id_literal_html_element_p = rb_intern("literal_html_element?");
|
||||||
|
id_html = rb_intern("html");
|
||||||
|
|
||||||
rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1);
|
rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1);
|
||||||
rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0);
|
rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0);
|
||||||
|
|
|
@ -89,6 +89,8 @@ public class Lexer extends RubyObject
|
||||||
@JRubyMethod
|
@JRubyMethod
|
||||||
public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
|
public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
|
||||||
{
|
{
|
||||||
|
Boolean html_p = this.callMethod(context, "html").isTrue();
|
||||||
|
|
||||||
Encoding encoding = rb_str.getEncoding();
|
Encoding encoding = rb_str.getEncoding();
|
||||||
|
|
||||||
byte[] data = rb_str.getBytes();
|
byte[] data = rb_str.getBytes();
|
||||||
|
|
|
@ -52,6 +52,11 @@
|
||||||
if ( fc == '\n' ) lines++;
|
if ( fc == '\n' ) lines++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
action hold_and_return {
|
||||||
|
fhold;
|
||||||
|
fret;
|
||||||
|
}
|
||||||
|
|
||||||
whitespace = [ \t];
|
whitespace = [ \t];
|
||||||
ident_char = [a-zA-Z0-9\-_];
|
ident_char = [a-zA-Z0-9\-_];
|
||||||
identifier = ident_char+;
|
identifier = ident_char+;
|
||||||
|
@ -370,10 +375,42 @@
|
||||||
};
|
};
|
||||||
*|;
|
*|;
|
||||||
|
|
||||||
|
# Characters that can be used for unquoted HTML attribute values.
|
||||||
|
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
|
||||||
|
# for more info.
|
||||||
|
html_unquoted_value = ^(
|
||||||
|
squote | dquote | '`' | '=' | '<' | '>' | whitespace | newline
|
||||||
|
)+;
|
||||||
|
|
||||||
|
# Machine used for processing HTML attribute values.
|
||||||
|
html_attribute_value := |*
|
||||||
|
squote => start_string_squote;
|
||||||
|
dquote => start_string_dquote;
|
||||||
|
|
||||||
|
# Unquoted attribute values are lexed as if they were single quoted
|
||||||
|
# strings.
|
||||||
|
html_unquoted_value => {
|
||||||
|
callback_simple(id_on_string_squote);
|
||||||
|
|
||||||
|
callback(id_on_string_body, data, encoding, ts, te);
|
||||||
|
|
||||||
|
callback_simple(id_on_string_squote);
|
||||||
|
};
|
||||||
|
|
||||||
|
any => hold_and_return;
|
||||||
|
*|;
|
||||||
|
|
||||||
|
# Machine used for processing XML attribute values.
|
||||||
|
xml_attribute_value := |*
|
||||||
|
squote => start_string_squote;
|
||||||
|
dquote => start_string_dquote;
|
||||||
|
any => hold_and_return;
|
||||||
|
*|;
|
||||||
|
|
||||||
# Machine used for processing the contents of an element's starting tag.
|
# Machine used for processing the contents of an element's starting tag.
|
||||||
# This includes the name, namespace and attributes.
|
# This includes the name, namespace and attributes.
|
||||||
element_head := |*
|
element_head := |*
|
||||||
whitespace | '=';
|
whitespace;
|
||||||
|
|
||||||
newline => {
|
newline => {
|
||||||
callback_simple(id_advance_line);
|
callback_simple(id_advance_line);
|
||||||
|
@ -389,8 +426,16 @@
|
||||||
};
|
};
|
||||||
|
|
||||||
# Attribute values.
|
# Attribute values.
|
||||||
squote => start_string_squote;
|
'=' => {
|
||||||
dquote => start_string_dquote;
|
if ( html_p )
|
||||||
|
{
|
||||||
|
fcall html_attribute_value;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
fcall xml_attribute_value;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
# We're done with the open tag of the element.
|
# We're done with the open tag of the element.
|
||||||
'>' => {
|
'>' => {
|
||||||
|
|
|
@ -0,0 +1,54 @@
|
||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
describe Oga::XML::Lexer do
|
||||||
|
describe 'HTML attributes' do
|
||||||
|
it 'lexes an attribute with an unquoted value' do
|
||||||
|
lex_html('<a href=foo></a>').should == [
|
||||||
|
[:T_ELEM_START, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'a', 1],
|
||||||
|
[:T_ATTR, 'href', 1],
|
||||||
|
[:T_STRING_SQUOTE, nil, 1],
|
||||||
|
[:T_STRING_BODY, 'foo', 1],
|
||||||
|
[:T_STRING_SQUOTE, nil, 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'lexes an attribute with an unquoted value containing a space' do
|
||||||
|
lex_html('<a href=foo bar></a>').should == [
|
||||||
|
[:T_ELEM_START, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'a', 1],
|
||||||
|
[:T_ATTR, 'href', 1],
|
||||||
|
[:T_STRING_SQUOTE, nil, 1],
|
||||||
|
[:T_STRING_BODY, 'foo', 1],
|
||||||
|
[:T_STRING_SQUOTE, nil, 1],
|
||||||
|
[:T_ATTR, 'bar', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'lexes an attribute with an unquoted value containing an underscore' do
|
||||||
|
lex_html('<a href=foo_bar></a>').should == [
|
||||||
|
[:T_ELEM_START, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'a', 1],
|
||||||
|
[:T_ATTR, 'href', 1],
|
||||||
|
[:T_STRING_SQUOTE, nil, 1],
|
||||||
|
[:T_STRING_BODY, 'foo_bar', 1],
|
||||||
|
[:T_STRING_SQUOTE, nil, 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'lexes an attribute with an unquoted value containing a dash' do
|
||||||
|
lex_html('<a href=foo-bar></a>').should == [
|
||||||
|
[:T_ELEM_START, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'a', 1],
|
||||||
|
[:T_ATTR, 'href', 1],
|
||||||
|
[:T_STRING_SQUOTE, nil, 1],
|
||||||
|
[:T_STRING_BODY, 'foo-bar', 1],
|
||||||
|
[:T_STRING_SQUOTE, nil, 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -36,6 +36,13 @@ module Oga
|
||||||
return lex(StringIO.new(input), options)
|
return lex(StringIO.new(input), options)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
##
|
||||||
|
# @see [#lex]
|
||||||
|
#
|
||||||
|
def lex_html(input)
|
||||||
|
return Oga::XML::Lexer.new(input, :html => true).lex
|
||||||
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
# Lexes an XPath expression.
|
# Lexes an XPath expression.
|
||||||
#
|
#
|
||||||
|
|
Loading…
Reference in New Issue