Fix lexing of quoted HTML attribute values followed by "/>"

This ensures that, when lexing input such as <a href="foo"/>, the "/" is
not treated as part of the quoted attribute value.
This commit is contained in:
Yorick Peterse 2015-04-15 01:47:08 +02:00
parent afbb585812
commit d892ce9787
4 changed files with 107 additions and 16 deletions

View File

@ -52,11 +52,6 @@
if ( fc == '\n' ) lines++;
}
action hold_and_return {
fhold;
fret;
}
whitespace = [ \t];
ident_char = [a-zA-Z0-9\-_];
identifier = ident_char+;
@ -375,6 +370,11 @@
};
*|;
action hold_start_element_head {
fhold;
fnext element_head;
}
# Characters that can be used for unquoted HTML attribute values.
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
# for more info.
@ -384,8 +384,10 @@
# Machine used for processing HTML attribute values.
html_attribute_value := |*
squote => start_string_squote;
dquote => start_string_dquote;
squote | dquote => {
fhold;
fnext html_attribute_value_quoted;
};
# Unquoted attribute values are lexed as if they were single quoted
# strings.
@ -397,14 +399,23 @@
callback_simple(id_on_string_squote);
};
any => hold_and_return;
any => hold_start_element_head;
*|;
# Machine specifically used when dealing with quoted HTML attributes. This
# ensures that input such as <a href="foo"/> doesn't result in "/" being
# considered part of the attribute value.
html_attribute_value_quoted := |*
squote => start_string_squote;
dquote => start_string_dquote;
any => hold_start_element_head;
*|;
# Machine used for processing XML attribute values.
xml_attribute_value := |*
squote => start_string_squote;
dquote => start_string_dquote;
any => hold_and_return;
any => hold_start_element_head;
*|;
# Machine used for processing the contents of an element's starting tag.
@ -429,11 +440,11 @@
'=' => {
if ( html_p )
{
fcall html_attribute_value;
fnext html_attribute_value;
}
else
{
fcall xml_attribute_value;
fnext xml_attribute_value;
}
};

View File

@ -171,6 +171,28 @@ describe Oga::XML::Lexer do
[:T_ELEM_END, nil, 1]
]
end
describe 'without a space before the closing tag' do
it 'lexes a void element' do
lex('<br/>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'br', 1],
[:T_ELEM_END, nil, 1]
]
end
it 'lexes a void element with an attribute' do
lex('<br class="foo"/>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'br', 1],
[:T_ATTR, 'class', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, 'foo', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
end
end
describe 'elements with namespaces' do

View File

@ -50,5 +50,17 @@ describe Oga::XML::Lexer do
[:T_ELEM_END, nil, 1]
]
end
it 'lexes an attribute with an unquoted value containing a slash' do
lex_html('<a href=foo/></a>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'a', 1],
[:T_ATTR, 'href', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_STRING_BODY, 'foo/', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
end
end

View File

@ -3,7 +3,7 @@ require 'spec_helper'
describe Oga::XML::Lexer do
describe 'HTML void elements' do
it 'lexes a void element that omits the closing /' do
lex('<link>', :html => true).should == [
lex_html('<link>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'link', 1],
[:T_ELEM_END, nil, 1]
@ -11,7 +11,7 @@ describe Oga::XML::Lexer do
end
it 'lexes a upper case void element' do
lex('<BR>', :html => true).should == [
lex_html('<BR>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, "BR", 1],
[:T_ELEM_END, nil, 1]
@ -19,7 +19,7 @@ describe Oga::XML::Lexer do
end
it 'lexes text after a void element' do
lex('<link>foo', :html => true).should == [
lex_html('<link>foo').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'link', 1],
[:T_ELEM_END, nil, 1],
@ -28,7 +28,7 @@ describe Oga::XML::Lexer do
end
it 'lexes a void element inside another element' do
lex('<head><link></head>', :html => true).should == [
lex_html('<head><link></head>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'head', 1],
[:T_ELEM_START, nil, 1],
@ -39,7 +39,7 @@ describe Oga::XML::Lexer do
end
it 'lexes a void element inside another element with whitespace' do
lex("<head><link>\n</head>", :html => true).should == [
lex_html("<head><link>\n</head>").should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'head', 1],
[:T_ELEM_START, nil, 1],
@ -49,5 +49,51 @@ describe Oga::XML::Lexer do
[:T_ELEM_END, nil, 2]
]
end
it 'lexes a void element with an unquoted attribute value' do
lex_html('<br class=foo />').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'br', 1],
[:T_ATTR, 'class', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_STRING_BODY, 'foo', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
describe 'without a space before the closing tag' do
it 'lexes a void element' do
lex_html('<br/>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'br', 1],
[:T_ELEM_END, nil, 1]
]
end
it 'lexes a void element with an attribute' do
lex_html('<br class="foo"/>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'br', 1],
[:T_ATTR, 'class', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, 'foo', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
it 'lexes a void element with an unquoted attribute value' do
lex_html('<br class=foo/>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'br', 1],
[:T_ATTR, 'class', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_STRING_BODY, 'foo/', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
end
end
end