Handle lexing of stray quotes in element heads

This adds lexing support for HTML/XML such as:

    <foo bar="""></foo>

While this markup is technically invalid, some websites (e.g. yahoo.com)
contain HTML just like it.

The lexer handles this as follows:

1. When we're in the "element_head" machine, do business as usual until
   we bump into a "=".

2. Call (using Ragel's "fcall") the machine to use for processing the
   attribute value (if any).

3. In this machine quoted strings are processed. The moment a string has
   been processed, the lexer jumps right back into the "element_head"
   machine. This ensures that any stray quotes are ignored instead of
   being processed as extra attribute values (which would eventually lead
   to parsing errors due to unbalanced quotes).
This commit is contained in:
Yorick Peterse 2015-04-15 22:33:53 +02:00
parent 9a0e31d0ae
commit 6b779d7883
2 changed files with 87 additions and 13 deletions

View File

@ -61,6 +61,11 @@
advance_line(1)
}
action hold_and_return {
fhold;
fret;
}
# Comments
#
# http://www.w3.org/TR/html-markup/syntax.html#comments
@ -383,11 +388,6 @@
};
*|;
action hold_start_element_head {
fhold;
fnext element_head;
}
# Characters that can be used for unquoted HTML attribute values.
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
# for more info.
@ -412,21 +412,32 @@
callback_simple(id_on_string_squote);
};
any => hold_start_element_head;
any => hold_and_return;
*|;
# Machine used for processing XML attribute values.
xml_attribute_value := |*
squote => start_string_squote;
dquote => start_string_dquote;
any => hold_start_element_head;
# The following two actions use "fnext" instead of "fcall". Because
# "element_head" uses "fcall" to jump to this machine, this means we can
# return to "element_head" after processing a single string.
squote => {
callback_simple(id_on_string_squote);
fnext string_squote;
};
dquote => {
callback_simple(id_on_string_dquote);
fnext string_dquote;
};
any => hold_and_return;
*|;
# Machine used for processing the contents of an element's starting tag.
# This includes the name, namespace and attributes.
element_head := |*
whitespace;
newline => advance_newline;
# Attribute names and namespaces.
@ -442,11 +453,11 @@
'=' => {
if ( html_p )
{
fnext html_attribute_value;
fcall html_attribute_value;
}
else
{
fnext xml_attribute_value;
fcall xml_attribute_value;
}
};
@ -469,6 +480,8 @@
callback_simple(id_on_element_end);
fnext main;
};
any;
*|;
# Text

View File

@ -9,6 +9,20 @@ describe Oga::XML::Lexer do
]
end
it 'lexes an opening element with a stray double quote' do
lex('<p">').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'p', 1]
]
end
it 'lexes an opening element with a stray double quoted string' do
lex('<p"">').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'p', 1]
]
end
it 'lexes an opening an closing element' do
lex('<p></p>').should == [
[:T_ELEM_START, nil, 1],
@ -17,6 +31,22 @@ describe Oga::XML::Lexer do
]
end
it 'lexes an opening an closing element with a stray double quote' do
lex('<p"></p>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'p', 1],
[:T_ELEM_END, nil, 1]
]
end
it 'lexes an opening an closing element with a stray double quoted string' do
lex('<p""></p>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'p', 1],
[:T_ELEM_END, nil, 1]
]
end
it 'lexes a paragraph element with text inside it' do
lex('<p>Hello</p>').should == [
[:T_ELEM_START, nil, 1],
@ -61,6 +91,15 @@ describe Oga::XML::Lexer do
]
end
it 'lexes an element with an empty attribute followed by a stray double quote' do
lex('<p foo"></p>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'p', 1],
[:T_ATTR, 'foo', 1],
[:T_ELEM_END, nil, 1]
]
end
it 'lexes an element with an attribute with an empty value' do
lex('<p foo=""></p>').should == [
[:T_ELEM_START, nil, 1],
@ -72,6 +111,28 @@ describe Oga::XML::Lexer do
]
end
it 'lexes an attribute value followed by a stray double quote' do
lex('<p foo="""></p>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'p', 1],
[:T_ATTR, 'foo', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
it 'lexes an attribute value followed by a stray single quote' do
lex('<p foo=""\'></p>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'p', 1],
[:T_ATTR, 'foo', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
it 'lexes a paragraph element with attributes' do
lex('<p class="foo">Hello</p>').should == [
[:T_ELEM_START, nil, 1],