Fixed parsing of HTML identifiers containing colons
HTML identifiers containing colons should be treated in two ways:

* For element names, the prefix (the namespace prefix in the case of XML) should be ignored, as HTML doesn't support or use namespaces.
* For attribute names, a colon is a valid character, so "foo:bar:baz" should be treated as a single attribute name.

This fixes #142.
This commit is contained in:
parent
a938f23a0e
commit
66fc4b1dfc
|
@ -53,6 +53,9 @@
|
|||
ident_char = unicode | [a-zA-Z0-9\-_\.];
|
||||
identifier = ident_char+;
|
||||
|
||||
html_ident_char = unicode | [a-zA-Z0-9\-_\.:];
|
||||
html_identifier = html_ident_char+;
|
||||
|
||||
whitespace_or_newline = whitespace | newline;
|
||||
|
||||
action count_newlines {
|
||||
|
@ -390,12 +393,23 @@
|
|||
# Machine used for lexing the name/namespace of an element.
|
||||
element_name := |*
|
||||
identifier ':' => {
|
||||
callback(id_on_element_ns, data, encoding, ts, te - 1);
|
||||
if ( !html_p )
|
||||
{
|
||||
callback(id_on_element_ns, data, encoding, ts, te - 1);
|
||||
}
|
||||
};
|
||||
|
||||
identifier => {
|
||||
callback(id_on_element_name, data, encoding, ts, te);
|
||||
fnext element_head;
|
||||
|
||||
if ( html_p )
|
||||
{
|
||||
fnext html_element_head;
|
||||
}
|
||||
else
|
||||
{
|
||||
fnext element_head;
|
||||
}
|
||||
};
|
||||
*|;
|
||||
|
||||
|
@ -508,8 +522,33 @@
|
|||
any => hold_and_return;
|
||||
*|;
|
||||
|
||||
# Machine used for processing the contents of an element's starting tag.
|
||||
# This includes the name, namespace and attributes.
|
||||
action start_attribute_pre {
|
||||
fcall attribute_pre;
|
||||
}
|
||||
|
||||
action close_open_element {
|
||||
callback_simple(id_on_element_open_end);
|
||||
|
||||
if ( html_script_p() )
|
||||
{
|
||||
fnext html_script;
|
||||
}
|
||||
else if ( html_style_p() )
|
||||
{
|
||||
fnext html_style;
|
||||
}
|
||||
else
|
||||
{
|
||||
fnext main;
|
||||
}
|
||||
}
|
||||
|
||||
action close_self_closing_element {
|
||||
callback_simple(id_on_element_end);
|
||||
fnext main;
|
||||
}
|
||||
|
||||
# Machine used for processing the contents of an XML element's starting tag.
|
||||
element_head := |*
|
||||
newline => advance_newline;
|
||||
|
||||
|
@ -522,12 +561,30 @@
|
|||
callback(id_on_attribute, data, encoding, ts, te);
|
||||
};
|
||||
|
||||
# Attribute values.
|
||||
'=' => {
|
||||
fcall attribute_pre;
|
||||
'=' => start_attribute_pre;
|
||||
|
||||
'>' => {
|
||||
callback_simple(id_on_element_open_end);
|
||||
|
||||
fnext main;
|
||||
};
|
||||
|
||||
# We're done with the open tag of the element.
|
||||
'/>' => close_self_closing_element;
|
||||
|
||||
any;
|
||||
*|;
|
||||
|
||||
# Machine used for processing the contents of an HTML element's starting
|
||||
# tag.
|
||||
html_element_head := |*
|
||||
newline => advance_newline;
|
||||
|
||||
html_identifier => {
|
||||
callback(id_on_attribute, data, encoding, ts, te);
|
||||
};
|
||||
|
||||
'=' => start_attribute_pre;
|
||||
|
||||
'>' => {
|
||||
callback_simple(id_on_element_open_end);
|
||||
|
||||
|
@ -545,11 +602,7 @@
|
|||
}
|
||||
};
|
||||
|
||||
# Self closing tags.
|
||||
'/>' => {
|
||||
callback_simple(id_on_element_end);
|
||||
fnext main;
|
||||
};
|
||||
'/>' => close_self_closing_element;
|
||||
|
||||
any;
|
||||
*|;
|
||||
|
|
|
@ -64,7 +64,11 @@ module Oga
|
|||
#
|
||||
# @return [Oga::XML::Attribute]
|
||||
def attribute(name)
|
||||
name, ns = split_name(name)
|
||||
if html?
|
||||
ns = nil
|
||||
else
|
||||
name, ns = split_name(name)
|
||||
end
|
||||
|
||||
attributes.each do |attr|
|
||||
return attr if attribute_matches?(attr, ns, name)
|
||||
|
|
|
@ -138,5 +138,16 @@ describe Oga::XML::Lexer do
|
|||
[:T_ELEM_END, nil, 2]
|
||||
]
|
||||
end
|
||||
|
||||
it 'lexes an element containing a namespaced attribute' do
|
||||
lex_html('<foo bar:baz="10" />').should == [
|
||||
[:T_ELEM_NAME, 'foo', 1],
|
||||
[:T_ATTR, 'bar:baz', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, '10', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::XML::Lexer do
|
||||
describe 'HTML elements' do
|
||||
it 'lexes an element containing an element namespace' do
|
||||
lex_html('<foo:bar />').should == [
|
||||
[:T_ELEM_NAME, 'bar', 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
end
|
||||
end
|
|
@ -105,6 +105,16 @@ describe Oga::XML::Element do
|
|||
it 'returns nil if an attribute has a namespace that is not given' do
|
||||
@instance.attribute('bar').nil?.should == true
|
||||
end
|
||||
|
||||
describe 'using an HTML document' do
|
||||
it 'returns an attribute containing a namespace separator' do
|
||||
attr = Oga::XML::Attribute.new(:name => 'foo:bar', :value => 'foo')
|
||||
el = described_class.new(:name => 'foo', :attributes => [attr])
|
||||
doc = Oga::XML::Document.new(:children => [el], :type => :html)
|
||||
|
||||
el.attribute('foo:bar').should == attr
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe '#get' do
|
||||
|
|
Loading…
Reference in New Issue