Lexing support for unquoted HTML attribute values
This adds support for HTML with unquoted attribute values, such as: <a href=foo>HTML is a child of Satan itself</a>. Fixes #94.
This commit is contained in:
parent
23a441933a
commit
afbb585812
|
@ -24,6 +24,7 @@ on `ts` and `te`) so the macro ignores this argument.
|
|||
|
||||
ID id_advance_line;
|
||||
ID id_literal_html_element_p;
|
||||
ID id_html;
|
||||
|
||||
%%machine c_lexer;
|
||||
|
||||
|
@ -75,6 +76,9 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
|
|||
OgaLexerState *state;
|
||||
int lines;
|
||||
|
||||
/* Whether or not HTML mode is enabled */
|
||||
int html_p = rb_funcall(self, id_html, 0) == Qtrue;
|
||||
|
||||
/* Make sure that all data passed back to Ruby has the proper encoding. */
|
||||
rb_encoding *encoding = rb_enc_get(data_block);
|
||||
|
||||
|
@ -181,6 +185,7 @@ void Init_liboga_xml_lexer()
|
|||
|
||||
id_advance_line = rb_intern("advance_line");
|
||||
id_literal_html_element_p = rb_intern("literal_html_element?");
|
||||
id_html = rb_intern("html");
|
||||
|
||||
rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1);
|
||||
rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0);
|
||||
|
|
|
@ -89,6 +89,8 @@ public class Lexer extends RubyObject
|
|||
@JRubyMethod
|
||||
public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
|
||||
{
|
||||
Boolean html_p = this.callMethod(context, "html").isTrue();
|
||||
|
||||
Encoding encoding = rb_str.getEncoding();
|
||||
|
||||
byte[] data = rb_str.getBytes();
|
||||
|
|
|
@ -52,6 +52,11 @@
|
|||
if ( fc == '\n' ) lines++;
|
||||
}
|
||||
|
||||
# Puts the current character back (fhold) and returns to the machine that
# entered the current one through fcall (fret). The attribute value machines
# use this for any input that does not start an attribute value.
action hold_and_return {
|
||||
fhold;
|
||||
fret;
|
||||
}
|
||||
|
||||
whitespace = [ \t];
|
||||
ident_char = [a-zA-Z0-9\-_];
|
||||
identifier = ident_char+;
|
||||
|
@ -370,10 +375,42 @@
|
|||
};
|
||||
*|;
|
||||
|
||||
# Characters that can be used for unquoted HTML attribute values.
|
||||
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
|
||||
# for more info.
#
# Matches one or more characters that are none of: a single quote, a double
# quote, a backtick, '=', '<', '>', whitespace or a newline. The value thus
# ends at the first such character.
|
||||
html_unquoted_value = ^(
|
||||
squote | dquote | '`' | '=' | '<' | '>' | whitespace | newline
|
||||
)+;
|
||||
|
||||
# Machine used for processing HTML attribute values.
#
# Entered with fcall from element_head after a '=' when HTML mode is enabled.
|
||||
html_attribute_value := |*
|
||||
squote => start_string_squote;
|
||||
dquote => start_string_dquote;
|
||||
|
||||
# Unquoted attribute values are lexed as if they were single quoted
|
||||
# strings.
#
# The callbacks below report an opening quote, the matched text between
# ts and te as the string body, and a closing quote.
|
||||
html_unquoted_value => {
|
||||
callback_simple(id_on_string_squote);
|
||||
|
||||
callback(id_on_string_body, data, encoding, ts, te);
|
||||
|
||||
callback_simple(id_on_string_squote);
|
||||
};
|
||||
|
||||
# Anything else is not an attribute value: hand the character back and
# return to the calling machine.
any => hold_and_return;
|
||||
*|;
|
||||
|
||||
# Machine used for processing XML attribute values.
#
# Entered with fcall from element_head after a '=' when HTML mode is not
# enabled. Only quoted values are recognised here.
|
||||
xml_attribute_value := |*
|
||||
squote => start_string_squote;
|
||||
dquote => start_string_dquote;
|
||||
# Anything else is not an attribute value: hand the character back and
# return to the calling machine.
any => hold_and_return;
|
||||
*|;
|
||||
|
||||
# Machine used for processing the contents of an element's starting tag.
|
||||
# This includes the name, namespace and attributes.
|
||||
element_head := |*
|
||||
whitespace | '=';
|
||||
whitespace;
|
||||
|
||||
newline => {
|
||||
callback_simple(id_advance_line);
|
||||
|
@ -389,8 +426,16 @@
|
|||
};
|
||||
|
||||
# Attribute values.
|
||||
squote => start_string_squote;
|
||||
dquote => start_string_dquote;
|
||||
'=' => {
|
||||
if ( html_p )
|
||||
{
|
||||
fcall html_attribute_value;
|
||||
}
|
||||
else
|
||||
{
|
||||
fcall xml_attribute_value;
|
||||
}
|
||||
};
|
||||
|
||||
# We're done with the open tag of the element.
|
||||
'>' => {
|
||||
|
|
|
@ -0,0 +1,54 @@
|
|||
require 'spec_helper'
|
||||
|
||||
# Specs for lexing unquoted HTML attribute values. Every example lexes its
# input through lex_html and compares the full list of emitted tokens.
describe Oga::XML::Lexer do
|
||||
describe 'HTML attributes' do
|
||||
# An unquoted value is reported as squote / body / squote tokens.
it 'lexes an attribute with an unquoted value' do
|
||||
lex_html('<a href=foo></a>').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'a', 1],
|
||||
[:T_ATTR, 'href', 1],
|
||||
[:T_STRING_SQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, 'foo', 1],
|
||||
[:T_STRING_SQUOTE, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
# The space ends the unquoted value: only 'foo' is the string body and
# 'bar' is expected as a separate attribute.
it 'lexes an attribute with an unquoted value containing a space' do
|
||||
lex_html('<a href=foo bar></a>').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'a', 1],
|
||||
[:T_ATTR, 'href', 1],
|
||||
[:T_STRING_SQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, 'foo', 1],
|
||||
[:T_STRING_SQUOTE, nil, 1],
|
||||
[:T_ATTR, 'bar', 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
# An underscore does not end the value; 'foo_bar' is a single body.
it 'lexes an attribute with an unquoted value containing an underscore' do
|
||||
lex_html('<a href=foo_bar></a>').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'a', 1],
|
||||
[:T_ATTR, 'href', 1],
|
||||
[:T_STRING_SQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, 'foo_bar', 1],
|
||||
[:T_STRING_SQUOTE, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
# A dash does not end the value either; 'foo-bar' is a single body.
it 'lexes an attribute with an unquoted value containing a dash' do
|
||||
lex_html('<a href=foo-bar></a>').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'a', 1],
|
||||
[:T_ATTR, 'href', 1],
|
||||
[:T_STRING_SQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, 'foo-bar', 1],
|
||||
[:T_STRING_SQUOTE, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
end
|
||||
end
|
|
@ -36,6 +36,13 @@ module Oga
|
|||
return lex(StringIO.new(input), options)
|
||||
end
|
||||
|
||||
##
|
||||
# Lexes `input` in HTML mode: builds an Oga::XML::Lexer with `:html => true`
# and returns the result of calling `lex` on it.
#
# @param input the source handed to the lexer (the specs pass String literals)
# @see [#lex]
|
||||
#
|
||||
def lex_html(input)
|
||||
return Oga::XML::Lexer.new(input, :html => true).lex
|
||||
end
|
||||
|
||||
##
|
||||
# Lexes an XPath expression.
|
||||
#
|
||||
|
|
Loading…
Reference in New Issue