From 66fc4b1dfcc4b651302c7582f62287d5750dcbfe Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Sat, 26 Dec 2015 20:28:35 +0100 Subject: [PATCH] Fixed parsing HTML identifiers containing colons HTML identifiers containing colons should be treated in two ways: * For element names the prefix (= the namespace prefix in case of XML) should be ignored as HTML doesn't support/use namespaces. * For attribute names a colon is a valid character, thus "foo:bar:baz" should be treated as a single attribute name. This fixes #142. --- ext/ragel/base_lexer.rl | 79 +++++++++++++++++++++----- lib/oga/xml/element.rb | 6 +- spec/oga/html/lexer/attributes_spec.rb | 11 ++++ spec/oga/html/lexer/elements_spec.rb | 12 ++++ spec/oga/xml/element_spec.rb | 10 ++++ 5 files changed, 104 insertions(+), 14 deletions(-) create mode 100644 spec/oga/html/lexer/elements_spec.rb diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index e41b40e..d5f8070 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -53,6 +53,9 @@ ident_char = unicode | [a-zA-Z0-9\-_\.]; identifier = ident_char+; + html_ident_char = unicode | [a-zA-Z0-9\-_\.:]; + html_identifier = html_ident_char+; + whitespace_or_newline = whitespace | newline; action count_newlines { @@ -390,12 +393,23 @@ # Machine used for lexing the name/namespace of an element. element_name := |* identifier ':' => { - callback(id_on_element_ns, data, encoding, ts, te - 1); + if ( !html_p ) + { + callback(id_on_element_ns, data, encoding, ts, te - 1); + } }; identifier => { callback(id_on_element_name, data, encoding, ts, te); - fnext element_head; + + if ( html_p ) + { + fnext html_element_head; + } + else + { + fnext element_head; + } }; *|; @@ -508,8 +522,33 @@ any => hold_and_return; *|; - # Machine used for processing the contents of an element's starting tag. - # This includes the name, namespace and attributes. 
+ action start_attribute_pre { + fcall attribute_pre; + } + + action close_open_element { + callback_simple(id_on_element_open_end); + + if ( html_script_p() ) + { + fnext html_script; + } + else if ( html_style_p() ) + { + fnext html_style; + } + else + { + fnext main; + } + } + + action close_self_closing_element { + callback_simple(id_on_element_end); + fnext main; + } + + # Machine used for processing the contents of an XML element's starting tag. element_head := |* newline => advance_newline; @@ -522,12 +561,30 @@ callback(id_on_attribute, data, encoding, ts, te); }; - # Attribute values. - '=' => { - fcall attribute_pre; + '=' => start_attribute_pre; + + '>' => { + callback_simple(id_on_element_open_end); + + fnext main; }; - # We're done with the open tag of the element. + '/>' => close_self_closing_element; + + any; + *|; + + # Machine used for processing the contents of an HTML element's starting + # tag. + html_element_head := |* + newline => advance_newline; + + html_identifier => { + callback(id_on_attribute, data, encoding, ts, te); + }; + + '=' => start_attribute_pre; + '>' => { callback_simple(id_on_element_open_end); @@ -545,11 +602,7 @@ } }; - # Self closing tags. - '/>' => { - callback_simple(id_on_element_end); - fnext main; - }; + '/>' => close_self_closing_element; any; *|; diff --git a/lib/oga/xml/element.rb b/lib/oga/xml/element.rb index 02bda03..c513f3c 100644 --- a/lib/oga/xml/element.rb +++ b/lib/oga/xml/element.rb @@ -64,7 +64,11 @@ module Oga # # @return [Oga::XML::Attribute] def attribute(name) - name, ns = split_name(name) + if html? 
+ ns = nil + else + name, ns = split_name(name) + end attributes.each do |attr| return attr if attribute_matches?(attr, ns, name) diff --git a/spec/oga/html/lexer/attributes_spec.rb b/spec/oga/html/lexer/attributes_spec.rb index e31c525..7773dd6 100644 --- a/spec/oga/html/lexer/attributes_spec.rb +++ b/spec/oga/html/lexer/attributes_spec.rb @@ -138,5 +138,16 @@ describe Oga::XML::Lexer do [:T_ELEM_END, nil, 2] ] end + + it 'lexes an element containing a namespaced attribute' do + lex_html('<foo bar:baz="10" />').should == [ + [:T_ELEM_NAME, 'foo', 1], + [:T_ATTR, 'bar:baz', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, '10', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_ELEM_END, nil, 1] + ] + end end end diff --git a/spec/oga/html/lexer/elements_spec.rb b/spec/oga/html/lexer/elements_spec.rb new file mode 100644 index 0000000..3988e8c --- /dev/null +++ b/spec/oga/html/lexer/elements_spec.rb @@ -0,0 +1,12 @@ +require 'spec_helper' + +describe Oga::XML::Lexer do + describe 'HTML elements' do + it 'lexes an element containing an element namespace' do + lex_html('<foo:bar />').should == [ + [:T_ELEM_NAME, 'bar', 1], + [:T_ELEM_END, nil, 1] + ] + end + end +end diff --git a/spec/oga/xml/element_spec.rb b/spec/oga/xml/element_spec.rb index 286a1d4..d43c6e5 100644 --- a/spec/oga/xml/element_spec.rb +++ b/spec/oga/xml/element_spec.rb @@ -105,6 +105,16 @@ describe Oga::XML::Element do it 'returns nil if an attribute has a namespace that is not given' do @instance.attribute('bar').nil?.should == true end + + describe 'using an HTML document' do + it 'returns an attribute containing a namespace separator' do + attr = Oga::XML::Attribute.new(:name => 'foo:bar', :value => 'foo') + el = described_class.new(:name => 'foo', :attributes => [attr]) + doc = Oga::XML::Document.new(:children => [el], :type => :html) + + el.attribute('foo:bar').should == attr + end + end end describe '#get' do