Fix lexing of quoted HTML attribute values followed by "/>"

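This ensures that when lexing input such as <a href="foo"/>, the "/" is
not treated as part of the attribute value. Only quoted values are
affected: for an unquoted value such as <a href=foo/>, the "/" is still
lexed as part of the value.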
This commit is contained in:
Yorick Peterse 2015-04-15 01:47:08 +02:00
parent afbb585812
commit d892ce9787
4 changed files with 107 additions and 16 deletions

View File

@ -52,11 +52,6 @@
if ( fc == '\n' ) lines++; if ( fc == '\n' ) lines++;
} }
action hold_and_return {
fhold;
fret;
}
whitespace = [ \t]; whitespace = [ \t];
ident_char = [a-zA-Z0-9\-_]; ident_char = [a-zA-Z0-9\-_];
identifier = ident_char+; identifier = ident_char+;
@ -375,6 +370,11 @@
}; };
*|; *|;
action hold_start_element_head {
fhold;
fnext element_head;
}
# Characters that can be used for unquoted HTML attribute values. # Characters that can be used for unquoted HTML attribute values.
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example # See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
# for more info. # for more info.
@ -384,8 +384,10 @@
# Machine used for processing HTML attribute values. # Machine used for processing HTML attribute values.
html_attribute_value := |* html_attribute_value := |*
squote => start_string_squote; squote | dquote => {
dquote => start_string_dquote; fhold;
fnext html_attribute_value_quoted;
};
# Unquoted attribute values are lexed as if they were single quoted # Unquoted attribute values are lexed as if they were single quoted
# strings. # strings.
@ -397,14 +399,23 @@
callback_simple(id_on_string_squote); callback_simple(id_on_string_squote);
}; };
any => hold_and_return; any => hold_start_element_head;
*|;
# Machine specifically used when dealing with quoted HTML attributes. This
# ensures that input such as <a href="foo"/> doesn't result in "/" being
# considered part of the attribute value.
html_attribute_value_quoted := |*
squote => start_string_squote;
dquote => start_string_dquote;
any => hold_start_element_head;
*|; *|;
# Machine used for processing XML attribute values. # Machine used for processing XML attribute values.
xml_attribute_value := |* xml_attribute_value := |*
squote => start_string_squote; squote => start_string_squote;
dquote => start_string_dquote; dquote => start_string_dquote;
any => hold_and_return; any => hold_start_element_head;
*|; *|;
# Machine used for processing the contents of an element's starting tag. # Machine used for processing the contents of an element's starting tag.
@ -429,11 +440,11 @@
'=' => { '=' => {
if ( html_p ) if ( html_p )
{ {
fcall html_attribute_value; fnext html_attribute_value;
} }
else else
{ {
fcall xml_attribute_value; fnext xml_attribute_value;
} }
}; };

View File

@ -171,6 +171,28 @@ describe Oga::XML::Lexer do
[:T_ELEM_END, nil, 1] [:T_ELEM_END, nil, 1]
] ]
end end
describe 'without a space before the closing tag' do
it 'lexes a void element' do
lex('<br/>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'br', 1],
[:T_ELEM_END, nil, 1]
]
end
it 'lexes a void element with an attribute' do
lex('<br class="foo"/>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'br', 1],
[:T_ATTR, 'class', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, 'foo', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
end
end end
describe 'elements with namespaces' do describe 'elements with namespaces' do

View File

@ -50,5 +50,17 @@ describe Oga::XML::Lexer do
[:T_ELEM_END, nil, 1] [:T_ELEM_END, nil, 1]
] ]
end end
it 'lexes an attribute with an unquoted value containing a slash' do
lex_html('<a href=foo/></a>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'a', 1],
[:T_ATTR, 'href', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_STRING_BODY, 'foo/', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
end end
end end

View File

@ -3,7 +3,7 @@ require 'spec_helper'
describe Oga::XML::Lexer do describe Oga::XML::Lexer do
describe 'HTML void elements' do describe 'HTML void elements' do
it 'lexes a void element that omits the closing /' do it 'lexes a void element that omits the closing /' do
lex('<link>', :html => true).should == [ lex_html('<link>').should == [
[:T_ELEM_START, nil, 1], [:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'link', 1], [:T_ELEM_NAME, 'link', 1],
[:T_ELEM_END, nil, 1] [:T_ELEM_END, nil, 1]
@ -11,7 +11,7 @@ describe Oga::XML::Lexer do
end end
it 'lexes a upper case void element' do it 'lexes a upper case void element' do
lex('<BR>', :html => true).should == [ lex_html('<BR>').should == [
[:T_ELEM_START, nil, 1], [:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, "BR", 1], [:T_ELEM_NAME, "BR", 1],
[:T_ELEM_END, nil, 1] [:T_ELEM_END, nil, 1]
@ -19,7 +19,7 @@ describe Oga::XML::Lexer do
end end
it 'lexes text after a void element' do it 'lexes text after a void element' do
lex('<link>foo', :html => true).should == [ lex_html('<link>foo').should == [
[:T_ELEM_START, nil, 1], [:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'link', 1], [:T_ELEM_NAME, 'link', 1],
[:T_ELEM_END, nil, 1], [:T_ELEM_END, nil, 1],
@ -28,7 +28,7 @@ describe Oga::XML::Lexer do
end end
it 'lexes a void element inside another element' do it 'lexes a void element inside another element' do
lex('<head><link></head>', :html => true).should == [ lex_html('<head><link></head>').should == [
[:T_ELEM_START, nil, 1], [:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'head', 1], [:T_ELEM_NAME, 'head', 1],
[:T_ELEM_START, nil, 1], [:T_ELEM_START, nil, 1],
@ -39,7 +39,7 @@ describe Oga::XML::Lexer do
end end
it 'lexes a void element inside another element with whitespace' do it 'lexes a void element inside another element with whitespace' do
lex("<head><link>\n</head>", :html => true).should == [ lex_html("<head><link>\n</head>").should == [
[:T_ELEM_START, nil, 1], [:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'head', 1], [:T_ELEM_NAME, 'head', 1],
[:T_ELEM_START, nil, 1], [:T_ELEM_START, nil, 1],
@ -49,5 +49,51 @@ describe Oga::XML::Lexer do
[:T_ELEM_END, nil, 2] [:T_ELEM_END, nil, 2]
] ]
end end
it 'lexes a void element with an unquoted attribute value' do
lex_html('<br class=foo />').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'br', 1],
[:T_ATTR, 'class', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_STRING_BODY, 'foo', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
describe 'without a space before the closing tag' do
it 'lexes a void element' do
lex_html('<br/>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'br', 1],
[:T_ELEM_END, nil, 1]
]
end
it 'lexes a void element with an attribute' do
lex_html('<br class="foo"/>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'br', 1],
[:T_ATTR, 'class', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, 'foo', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
it 'lexes a void element with an unquoted attribute value' do
lex_html('<br class=foo/>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'br', 1],
[:T_ATTR, 'class', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_STRING_BODY, 'foo/', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
end
end end
end end