Handle lexing of stray quotes in element heads
This adds lexing support for HTML/XML such as: <foo bar="""></foo>. While technically invalid, some websites (e.g. yahoo.com) contain HTML just like this. The lexer handles this as follows: 1. When we're in the "element_head" machine, do business as usual until we bump into a "=". 2. Call (using Ragel's "fcall") the machine to use for processing the attribute value (if any). 3. In this machine quoted strings are processed. The moment a string has been processed the lexer jumps right back into the "element_head" machine. This ensures that any stray quotes are ignored instead of being processed as extra attribute values (eventually leading to parsing errors due to unbalanced quotes).
This commit is contained in:
parent
9a0e31d0ae
commit
6b779d7883
|
@ -61,6 +61,11 @@
|
||||||
advance_line(1)
|
advance_line(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
action hold_and_return {
|
||||||
|
fhold;
|
||||||
|
fret;
|
||||||
|
}
|
||||||
|
|
||||||
# Comments
|
# Comments
|
||||||
#
|
#
|
||||||
# http://www.w3.org/TR/html-markup/syntax.html#comments
|
# http://www.w3.org/TR/html-markup/syntax.html#comments
|
||||||
|
@ -383,11 +388,6 @@
|
||||||
};
|
};
|
||||||
*|;
|
*|;
|
||||||
|
|
||||||
action hold_start_element_head {
|
|
||||||
fhold;
|
|
||||||
fnext element_head;
|
|
||||||
}
|
|
||||||
|
|
||||||
# Characters that can be used for unquoted HTML attribute values.
|
# Characters that can be used for unquoted HTML attribute values.
|
||||||
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
|
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
|
||||||
# for more info.
|
# for more info.
|
||||||
|
@ -412,21 +412,32 @@
|
||||||
callback_simple(id_on_string_squote);
|
callback_simple(id_on_string_squote);
|
||||||
};
|
};
|
||||||
|
|
||||||
any => hold_start_element_head;
|
any => hold_and_return;
|
||||||
*|;
|
*|;
|
||||||
|
|
||||||
# Machine used for processing XML attribute values.
|
# Machine used for processing XML attribute values.
|
||||||
xml_attribute_value := |*
|
xml_attribute_value := |*
|
||||||
squote => start_string_squote;
|
# The following two actions use "fnext" instead of "fcall". Combined
|
||||||
dquote => start_string_dquote;
|
# with "element_head" using "fcall" to jump to this machine this means
|
||||||
any => hold_start_element_head;
|
# we can return to "element_head" after processing a single string.
|
||||||
|
squote => {
|
||||||
|
callback_simple(id_on_string_squote);
|
||||||
|
|
||||||
|
fnext string_squote;
|
||||||
|
};
|
||||||
|
|
||||||
|
dquote => {
|
||||||
|
callback_simple(id_on_string_dquote);
|
||||||
|
|
||||||
|
fnext string_dquote;
|
||||||
|
};
|
||||||
|
|
||||||
|
any => hold_and_return;
|
||||||
*|;
|
*|;
|
||||||
|
|
||||||
# Machine used for processing the contents of an element's starting tag.
|
# Machine used for processing the contents of an element's starting tag.
|
||||||
# This includes the name, namespace and attributes.
|
# This includes the name, namespace and attributes.
|
||||||
element_head := |*
|
element_head := |*
|
||||||
whitespace;
|
|
||||||
|
|
||||||
newline => advance_newline;
|
newline => advance_newline;
|
||||||
|
|
||||||
# Attribute names and namespaces.
|
# Attribute names and namespaces.
|
||||||
|
@ -442,11 +453,11 @@
|
||||||
'=' => {
|
'=' => {
|
||||||
if ( html_p )
|
if ( html_p )
|
||||||
{
|
{
|
||||||
fnext html_attribute_value;
|
fcall html_attribute_value;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
fnext xml_attribute_value;
|
fcall xml_attribute_value;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -469,6 +480,8 @@
|
||||||
callback_simple(id_on_element_end);
|
callback_simple(id_on_element_end);
|
||||||
fnext main;
|
fnext main;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
any;
|
||||||
*|;
|
*|;
|
||||||
|
|
||||||
# Text
|
# Text
|
||||||
|
|
|
@ -9,6 +9,20 @@ describe Oga::XML::Lexer do
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it 'lexes an opening element with a stray double quote' do
|
||||||
|
lex('<p">').should == [
|
||||||
|
[:T_ELEM_START, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'p', 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'lexes an opening element with a stray double quoted string' do
|
||||||
|
lex('<p"">').should == [
|
||||||
|
[:T_ELEM_START, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'p', 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
it 'lexes an opening an closing element' do
|
it 'lexes an opening an closing element' do
|
||||||
lex('<p></p>').should == [
|
lex('<p></p>').should == [
|
||||||
[:T_ELEM_START, nil, 1],
|
[:T_ELEM_START, nil, 1],
|
||||||
|
@ -17,6 +31,22 @@ describe Oga::XML::Lexer do
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it 'lexes an opening an closing element with a stray double quote' do
|
||||||
|
lex('<p"></p>').should == [
|
||||||
|
[:T_ELEM_START, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'p', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'lexes an opening an closing element with a stray double quoted string' do
|
||||||
|
lex('<p""></p>').should == [
|
||||||
|
[:T_ELEM_START, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'p', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
it 'lexes a paragraph element with text inside it' do
|
it 'lexes a paragraph element with text inside it' do
|
||||||
lex('<p>Hello</p>').should == [
|
lex('<p>Hello</p>').should == [
|
||||||
[:T_ELEM_START, nil, 1],
|
[:T_ELEM_START, nil, 1],
|
||||||
|
@ -61,6 +91,15 @@ describe Oga::XML::Lexer do
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it 'lexes an element with an empty attribute followed by a stray double quote' do
|
||||||
|
lex('<p foo"></p>').should == [
|
||||||
|
[:T_ELEM_START, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'p', 1],
|
||||||
|
[:T_ATTR, 'foo', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
it 'lexes an element with an attribute with an empty value' do
|
it 'lexes an element with an attribute with an empty value' do
|
||||||
lex('<p foo=""></p>').should == [
|
lex('<p foo=""></p>').should == [
|
||||||
[:T_ELEM_START, nil, 1],
|
[:T_ELEM_START, nil, 1],
|
||||||
|
@ -72,6 +111,28 @@ describe Oga::XML::Lexer do
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it 'lexes an attribute value followed by a stray double quote' do
|
||||||
|
lex('<p foo="""></p>').should == [
|
||||||
|
[:T_ELEM_START, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'p', 1],
|
||||||
|
[:T_ATTR, 'foo', 1],
|
||||||
|
[:T_STRING_DQUOTE, nil, 1],
|
||||||
|
[:T_STRING_DQUOTE, nil, 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'lexes an attribute value followed by a stray single quote' do
|
||||||
|
lex('<p foo=""\'></p>').should == [
|
||||||
|
[:T_ELEM_START, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'p', 1],
|
||||||
|
[:T_ATTR, 'foo', 1],
|
||||||
|
[:T_STRING_DQUOTE, nil, 1],
|
||||||
|
[:T_STRING_DQUOTE, nil, 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
it 'lexes a paragraph element with attributes' do
|
it 'lexes a paragraph element with attributes' do
|
||||||
lex('<p class="foo">Hello</p>').should == [
|
lex('<p class="foo">Hello</p>').should == [
|
||||||
[:T_ELEM_START, nil, 1],
|
[:T_ELEM_START, nil, 1],
|
||||||
|
|
Loading…
Reference in New Issue