diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index 8606e64..2a6143a 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -52,11 +52,6 @@ if ( fc == '\n' ) lines++; } - action hold_and_return { - fhold; - fret; - } - whitespace = [ \t]; ident_char = [a-zA-Z0-9\-_]; identifier = ident_char+; @@ -375,6 +370,11 @@ }; *|; + action hold_start_element_head { + fhold; + fnext element_head; + } + # Characters that can be used for unquoted HTML attribute values. # See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example # for more info. @@ -384,8 +384,10 @@ # Machine used for processing HTML attribute values. html_attribute_value := |* - squote => start_string_squote; - dquote => start_string_dquote; + squote | dquote => { + fhold; + fnext html_attribute_value_quoted; + }; # Unquoted attribute values are lexed as if they were single quoted # strings. @@ -397,14 +399,23 @@ callback_simple(id_on_string_squote); }; - any => hold_and_return; + any => hold_start_element_head; + *|; + + # Machine specifically used when dealing with quoted HTML attributes. This + # ensures that input such as <br class="foo"/> doesn't result in "/" being + # considered part of the attribute value. + html_attribute_value_quoted := |* + squote => start_string_squote; + dquote => start_string_dquote; + any => hold_start_element_head; *|; # Machine used for processing XML attribute values. xml_attribute_value := |* squote => start_string_squote; dquote => start_string_dquote; - any => hold_and_return; + any => hold_start_element_head; *|; # Machine used for processing the contents of an element's starting tag. 
@@ -429,11 +440,11 @@ '=' => { if ( html_p ) { - fcall html_attribute_value; + fnext html_attribute_value; } else { - fcall xml_attribute_value; + fnext xml_attribute_value; } }; diff --git a/spec/oga/xml/lexer/elements_spec.rb b/spec/oga/xml/lexer/elements_spec.rb index 2450232..3ed9c7f 100644 --- a/spec/oga/xml/lexer/elements_spec.rb +++ b/spec/oga/xml/lexer/elements_spec.rb @@ -171,6 +171,28 @@ describe Oga::XML::Lexer do [:T_ELEM_END, nil, 1] ] end + + describe 'without a space before the closing tag' do + it 'lexes a void element' do + lex('
').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'br', 1], + [:T_ELEM_END, nil, 1] + ] + end + + it 'lexes a void element with an attribute' do + lex('
').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'br', 1], + [:T_ATTR, 'class', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, 'foo', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_ELEM_END, nil, 1] + ] + end + end end describe 'elements with namespaces' do diff --git a/spec/oga/xml/lexer/html_attributes_spec.rb b/spec/oga/xml/lexer/html_attributes_spec.rb index 41b79d4..f6b53fe 100644 --- a/spec/oga/xml/lexer/html_attributes_spec.rb +++ b/spec/oga/xml/lexer/html_attributes_spec.rb @@ -50,5 +50,17 @@ describe Oga::XML::Lexer do [:T_ELEM_END, nil, 1] ] end + + it 'lexes an attribute with an unquoted value containing a slash' do + lex_html('
').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'a', 1], + [:T_ATTR, 'href', 1], + [:T_STRING_SQUOTE, nil, 1], + [:T_STRING_BODY, 'foo/', 1], + [:T_STRING_SQUOTE, nil, 1], + [:T_ELEM_END, nil, 1] + ] + end end end diff --git a/spec/oga/xml/lexer/html_void_elements_spec.rb b/spec/oga/xml/lexer/html_void_elements_spec.rb index cdd89f7..a0a07f2 100644 --- a/spec/oga/xml/lexer/html_void_elements_spec.rb +++ b/spec/oga/xml/lexer/html_void_elements_spec.rb @@ -3,7 +3,7 @@ require 'spec_helper' describe Oga::XML::Lexer do describe 'HTML void elements' do it 'lexes a void element that omits the closing /' do - lex('', :html => true).should == [ + lex_html('').should == [ [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'link', 1], [:T_ELEM_END, nil, 1] @@ -11,7 +11,7 @@ describe Oga::XML::Lexer do end it 'lexes a upper case void element' do - lex('
', :html => true).should == [ + lex_html('
').should == [ [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, "BR", 1], [:T_ELEM_END, nil, 1] @@ -19,7 +19,7 @@ describe Oga::XML::Lexer do end it 'lexes text after a void element' do - lex('foo', :html => true).should == [ + lex_html('foo').should == [ [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'link', 1], [:T_ELEM_END, nil, 1], @@ -28,7 +28,7 @@ describe Oga::XML::Lexer do end it 'lexes a void element inside another element' do - lex('', :html => true).should == [ + lex_html('').should == [ [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'head', 1], [:T_ELEM_START, nil, 1], @@ -39,7 +39,7 @@ describe Oga::XML::Lexer do end it 'lexes a void element inside another element with whitespace' do - lex("\n", :html => true).should == [ + lex_html("\n").should == [ [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'head', 1], [:T_ELEM_START, nil, 1], @@ -49,5 +49,51 @@ describe Oga::XML::Lexer do [:T_ELEM_END, nil, 2] ] end + + it 'lexes a void element with an unquoted attribute value' do + lex_html('
').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'br', 1], + [:T_ATTR, 'class', 1], + [:T_STRING_SQUOTE, nil, 1], + [:T_STRING_BODY, 'foo', 1], + [:T_STRING_SQUOTE, nil, 1], + [:T_ELEM_END, nil, 1] + ] + end + + describe 'without a space before the closing tag' do + it 'lexes a void element' do + lex_html('
').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'br', 1], + [:T_ELEM_END, nil, 1] + ] + end + + it 'lexes a void element with an attribute' do + lex_html('
').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'br', 1], + [:T_ATTR, 'class', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, 'foo', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_ELEM_END, nil, 1] + ] + end + + it 'lexes a void element with an unquoted attribute value' do + lex_html('
').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'br', 1], + [:T_ATTR, 'class', 1], + [:T_STRING_SQUOTE, nil, 1], + [:T_STRING_BODY, 'foo/', 1], + [:T_STRING_SQUOTE, nil, 1], + [:T_ELEM_END, nil, 1] + ] + end + end end end