From 6b779d788384b89ba30ef60c17a156216ba5b333 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Wed, 15 Apr 2015 22:33:53 +0200 Subject: [PATCH] Handle lexing of stray quotes in element heads This adds lexing support for HTML/XML in which an attribute value is followed by a stray quote, such as: <p foo="""></p> While technically invalid, some websites (e.g. yahoo.com) contain HTML just like this. The lexer handles this as follows: 1. When we're in the "element_head" machine, do business as usual until we bump into a "=". 2. Call (using Ragel's "fcall") the machine to use for processing the attribute value (if any). 3. In this machine quoted strings are processed. The moment a string has been processed the lexer jumps right back into the "element_head" machine. This ensures that any stray quotes are ignored instead of being processed as extra attribute values (eventually leading to parsing errors due to unbalanced quotes). --- ext/ragel/base_lexer.rl | 39 ++++++++++++------ spec/oga/xml/lexer/elements_spec.rb | 61 +++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 13 deletions(-) diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index dca0eff..b2331de 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -61,6 +61,11 @@ advance_line(1) } + action hold_and_return { + fhold; + fret; + } + # Comments # # http://www.w3.org/TR/html-markup/syntax.html#comments @@ -383,11 +388,6 @@ }; *|; - action hold_start_element_head { - fhold; - fnext element_head; - } - # Characters that can be used for unquoted HTML attribute values. # See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example # for more info. @@ -412,21 +412,32 @@ callback_simple(id_on_string_squote); }; - any => hold_start_element_head; + any => hold_and_return; *|; # Machine used for processing XML attribute values. xml_attribute_value := |* - squote => start_string_squote; - dquote => start_string_dquote; - any => hold_start_element_head; + # The following two actions use "fnext" instead of "fcall". 
Combined + # with "element_head" using "fcall" to jump to this machine this means + # we can return back to "element_head" after processing a single string. + squote => { + callback_simple(id_on_string_squote); + + fnext string_squote; + }; + + dquote => { + callback_simple(id_on_string_dquote); + + fnext string_dquote; + }; + + any => hold_and_return; *|; # Machine used for processing the contents of an element's starting tag. # This includes the name, namespace and attributes. element_head := |* - whitespace; - newline => advance_newline; # Attribute names and namespaces. @@ -442,11 +453,11 @@ '=' => { if ( html_p ) { - fnext html_attribute_value; + fcall html_attribute_value; } else { - fnext xml_attribute_value; + fcall xml_attribute_value; } }; @@ -469,6 +480,8 @@ callback_simple(id_on_element_end); fnext main; }; + + any; *|; # Text diff --git a/spec/oga/xml/lexer/elements_spec.rb b/spec/oga/xml/lexer/elements_spec.rb index 3ed9c7f..f5d4851 100644 --- a/spec/oga/xml/lexer/elements_spec.rb +++ b/spec/oga/xml/lexer/elements_spec.rb @@ -9,6 +9,20 @@ describe Oga::XML::Lexer do ] end + it 'lexes an opening element with a stray double quote' do + lex('').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'p', 1] + ] + end + + it 'lexes an opening element with a stray double quoted string' do + lex('').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'p', 1] + ] + end + it 'lexes an opening an closing element' do lex('

').should == [ [:T_ELEM_START, nil, 1], @@ -17,6 +31,22 @@ describe Oga::XML::Lexer do ] end + it 'lexes an opening an closing element with a stray double quote' do + lex('

').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'p', 1], + [:T_ELEM_END, nil, 1] + ] + end + + it 'lexes an opening an closing element with a stray double quoted string' do + lex('

').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'p', 1], + [:T_ELEM_END, nil, 1] + ] + end + it 'lexes a paragraph element with text inside it' do lex('

Hello

').should == [ [:T_ELEM_START, nil, 1], @@ -61,6 +91,15 @@ describe Oga::XML::Lexer do ] end + it 'lexes an element with an empty attribute followed by a stray double quote' do + lex('

').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'p', 1], + [:T_ATTR, 'foo', 1], + [:T_ELEM_END, nil, 1] + ] + end + it 'lexes an element with an attribute with an empty value' do lex('

').should == [ [:T_ELEM_START, nil, 1], @@ -72,6 +111,28 @@ describe Oga::XML::Lexer do ] end + it 'lexes an attribute value followed by a stray double quote' do + lex('

').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'p', 1], + [:T_ATTR, 'foo', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_ELEM_END, nil, 1] + ] + end + + it 'lexes an attribute value followed by a stray single quote' do + lex('

').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'p', 1], + [:T_ATTR, 'foo', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_ELEM_END, nil, 1] + ] + end + it 'lexes a paragraph element with attributes' do lex('

Hello

').should == [ [:T_ELEM_START, nil, 1],