Fixed parsing HTML identifiers containing colons

HTML identifiers containing colons should be treated in two ways:

* For element names, the prefix (which would be the namespace prefix in XML)
  should be ignored, as HTML doesn't support or use namespaces.
* For attribute names, a colon is a valid character; thus "foo:bar:baz"
  should be treated as a single attribute name.

This fixes #142.
This commit is contained in:
Yorick Peterse 2015-12-26 20:28:35 +01:00
parent a938f23a0e
commit 66fc4b1dfc
5 changed files with 104 additions and 14 deletions

View File

@ -53,6 +53,9 @@
ident_char = unicode | [a-zA-Z0-9\-_\.];
identifier = ident_char+;
# Same character class as ident_char with ':' added, so that a name such as
# "foo:bar:baz" is matched as one HTML identifier instead of being split.
html_ident_char = unicode | [a-zA-Z0-9\-_\.:];
# One or more html_ident_char characters.
html_identifier = html_ident_char+;
whitespace_or_newline = whitespace | newline;
action count_newlines {
@ -390,12 +393,23 @@
# Machine used for lexing the name/namespace of an element.
element_name := |*
identifier ':' => {
callback(id_on_element_ns, data, encoding, ts, te - 1);
if ( !html_p )
{
callback(id_on_element_ns, data, encoding, ts, te - 1);
}
};
identifier => {
callback(id_on_element_name, data, encoding, ts, te);
fnext element_head;
if ( html_p )
{
fnext html_element_head;
}
else
{
fnext element_head;
}
};
*|;
@ -508,8 +522,33 @@
any => hold_and_return;
*|;
# Machine used for processing the contents of an element's starting tag.
# This includes the name, namespace and attributes.
# Enters the attribute_pre machine through fcall, which pushes the machine
# that ran this action onto the call stack so it can be returned to later.
# Bound to the '=' token that follows an attribute name.
action start_attribute_pre {
fcall attribute_pre;
}
# Emits the on_element_open_end callback and then selects the machine to run
# next: html_script when html_script_p() is true, html_style when
# html_style_p() is true, and main otherwise.
# NOTE(review): html_script_p() and html_style_p() are defined outside this
# view; presumably they report whether the element that was just opened is an
# HTML <script> or <style> element -- confirm.
action close_open_element {
callback_simple(id_on_element_open_end);
if ( html_script_p() )
{
fnext html_script;
}
else if ( html_style_p() )
{
fnext html_style;
}
else
{
fnext main;
}
}
# Emits the on_element_end callback and makes main the next machine. Bound to
# the "/>" token that terminates a self-closing element.
action close_self_closing_element {
callback_simple(id_on_element_end);
fnext main;
}
# Machine used for processing the contents of an XML element's starting tag.
element_head := |*
newline => advance_newline;
@ -522,12 +561,30 @@
callback(id_on_attribute, data, encoding, ts, te);
};
# Attribute values.
'=' => {
fcall attribute_pre;
'=' => start_attribute_pre;
'>' => {
callback_simple(id_on_element_open_end);
fnext main;
};
# We're done with the open tag of the element.
'/>' => close_self_closing_element;
any;
*|;
# Machine used for processing the contents of an HTML element's starting
# tag.
html_element_head := |*
newline => advance_newline;
html_identifier => {
callback(id_on_attribute, data, encoding, ts, te);
};
'=' => start_attribute_pre;
'>' => {
callback_simple(id_on_element_open_end);
@ -545,11 +602,7 @@
}
};
# Self closing tags.
'/>' => {
callback_simple(id_on_element_end);
fnext main;
};
'/>' => close_self_closing_element;
any;
*|;

View File

@ -64,7 +64,11 @@ module Oga
#
# @return [Oga::XML::Attribute]
def attribute(name)
name, ns = split_name(name)
if html?
ns = nil
else
name, ns = split_name(name)
end
attributes.each do |attr|
return attr if attribute_matches?(attr, ns, name)

View File

@ -138,5 +138,16 @@ describe Oga::XML::Lexer do
[:T_ELEM_END, nil, 2]
]
end
# A colon is a valid character in an HTML attribute name, so "bar:baz" is
# expected to come back as a single T_ATTR token rather than being split
# into a namespace part and a name part.
it 'lexes an element containing a namespaced attribute' do
lex_html('<foo bar:baz="10" />').should == [
[:T_ELEM_NAME, 'foo', 1],
[:T_ATTR, 'bar:baz', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, '10', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
end
end

View File

@ -0,0 +1,12 @@
require 'spec_helper'
# Specs for how the lexer handles element names when lexing HTML input.
describe Oga::XML::Lexer do
describe 'HTML elements' do
# The "foo:" prefix is dropped when lexing HTML: only the T_ELEM_NAME token
# for "bar" is expected, with no separate namespace token before it.
it 'lexes an element containing an element namespace' do
lex_html('<foo:bar />').should == [
[:T_ELEM_NAME, 'bar', 1],
[:T_ELEM_END, nil, 1]
]
end
end
end

View File

@ -105,6 +105,16 @@ describe Oga::XML::Element do
it 'returns nil if an attribute has a namespace that is not given' do
@instance.attribute('bar').nil?.should == true
end
# Attribute lookup on an element that belongs to an HTML document: the full
# name, colon included, is expected to match the attribute.
describe 'using an HTML document' do
it 'returns an attribute containing a namespace separator' do
attr = Oga::XML::Attribute.new(:name => 'foo:bar', :value => 'foo')
el = described_class.new(:name => 'foo', :attributes => [attr])
# `doc` is never read again; it is created only to place `el` inside a
# document of type :html.
# NOTE(review): presumably Document.new links `el` back to the document so
# that the element reports itself as HTML -- confirm against Document.
doc = Oga::XML::Document.new(:children => [el], :type => :html)
el.attribute('foo:bar').should == attr
end
end
end
describe '#get' do