Handle lexing of stray quotes in element heads
This adds lexing support for HTML/XML such as: <foo bar="""></foo>. While technically invalid, some websites (e.g. yahoo.com) contain HTML just like this. The lexer handles this as follows: 1. When we're in the "element_head" machine, do business as usual until we bump into a "=". 2. Call (using Ragel's "fcall") the machine to use for processing the attribute value (if any). 3. In this machine quoted strings are processed. The moment a string has been processed, the lexer jumps right back into the "element_head" machine. This ensures that any stray quotes are ignored instead of being processed as extra attribute values (which would eventually lead to parsing errors due to unbalanced quotes).
This commit is contained in:
parent
9a0e31d0ae
commit
6b779d7883
|
@ -61,6 +61,11 @@
|
|||
advance_line(1)
|
||||
}
|
||||
|
||||
action hold_and_return {
|
||||
fhold;
|
||||
fret;
|
||||
}
|
||||
|
||||
# Comments
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#comments
|
||||
|
@ -383,11 +388,6 @@
|
|||
};
|
||||
*|;
|
||||
|
||||
action hold_start_element_head {
|
||||
fhold;
|
||||
fnext element_head;
|
||||
}
|
||||
|
||||
# Characters that can be used for unquoted HTML attribute values.
|
||||
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
|
||||
# for more info.
|
||||
|
@ -412,21 +412,32 @@
|
|||
callback_simple(id_on_string_squote);
|
||||
};
|
||||
|
||||
any => hold_start_element_head;
|
||||
any => hold_and_return;
|
||||
*|;
|
||||
|
||||
# Machine used for processing XML attribute values.
|
||||
xml_attribute_value := |*
|
||||
squote => start_string_squote;
|
||||
dquote => start_string_dquote;
|
||||
any => hold_start_element_head;
|
||||
# The following two actions use "fnext" instead of "fcall". Combined
|
||||
# with "element_head" using "fcall" to jump to this machine this means
|
||||
# we can return back to "element_head" after processing a single string.
|
||||
squote => {
|
||||
callback_simple(id_on_string_squote);
|
||||
|
||||
fnext string_squote;
|
||||
};
|
||||
|
||||
dquote => {
|
||||
callback_simple(id_on_string_dquote);
|
||||
|
||||
fnext string_dquote;
|
||||
};
|
||||
|
||||
any => hold_and_return;
|
||||
*|;
|
||||
|
||||
# Machine used for processing the contents of an element's starting tag.
|
||||
# This includes the name, namespace and attributes.
|
||||
element_head := |*
|
||||
whitespace;
|
||||
|
||||
newline => advance_newline;
|
||||
|
||||
# Attribute names and namespaces.
|
||||
|
@ -442,11 +453,11 @@
|
|||
'=' => {
|
||||
if ( html_p )
|
||||
{
|
||||
fnext html_attribute_value;
|
||||
fcall html_attribute_value;
|
||||
}
|
||||
else
|
||||
{
|
||||
fnext xml_attribute_value;
|
||||
fcall xml_attribute_value;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -469,6 +480,8 @@
|
|||
callback_simple(id_on_element_end);
|
||||
fnext main;
|
||||
};
|
||||
|
||||
any;
|
||||
*|;
|
||||
|
||||
# Text
|
||||
|
|
|
@ -9,6 +9,20 @@ describe Oga::XML::Lexer do
|
|||
]
|
||||
end
|
||||
|
||||
it 'lexes an opening element with a stray double quote' do
|
||||
lex('<p">').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'p', 1]
|
||||
]
|
||||
end
|
||||
|
||||
it 'lexes an opening element with a stray double quoted string' do
|
||||
lex('<p"">').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'p', 1]
|
||||
]
|
||||
end
|
||||
|
||||
it 'lexes an opening an closing element' do
|
||||
lex('<p></p>').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
|
@ -17,6 +31,22 @@ describe Oga::XML::Lexer do
|
|||
]
|
||||
end
|
||||
|
||||
it 'lexes an opening an closing element with a stray double quote' do
|
||||
lex('<p"></p>').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'p', 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
it 'lexes an opening an closing element with a stray double quoted string' do
|
||||
lex('<p""></p>').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'p', 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
it 'lexes a paragraph element with text inside it' do
|
||||
lex('<p>Hello</p>').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
|
@ -61,6 +91,15 @@ describe Oga::XML::Lexer do
|
|||
]
|
||||
end
|
||||
|
||||
it 'lexes an element with an empty attribute followed by a stray double quote' do
|
||||
lex('<p foo"></p>').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'p', 1],
|
||||
[:T_ATTR, 'foo', 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
it 'lexes an element with an attribute with an empty value' do
|
||||
lex('<p foo=""></p>').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
|
@ -72,6 +111,28 @@ describe Oga::XML::Lexer do
|
|||
]
|
||||
end
|
||||
|
||||
it 'lexes an attribute value followed by a stray double quote' do
|
||||
lex('<p foo="""></p>').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'p', 1],
|
||||
[:T_ATTR, 'foo', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
it 'lexes an attribute value followed by a stray single quote' do
|
||||
lex('<p foo=""\'></p>').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'p', 1],
|
||||
[:T_ATTR, 'foo', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
it 'lexes a paragraph element with attributes' do
|
||||
lex('<p class="foo">Hello</p>').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
|
|
Loading…
Reference in New Issue