Decode XML/HTML entities in the SAX parser

This was broken when decoding was moved out of the Lexer class into
XML::Text and XML::Attribute.

Fixes #92
This commit is contained in:
Yorick Peterse 2015-04-18 22:03:44 +02:00
parent 611beb78c7
commit da62fcd75d
3 changed files with 174 additions and 14 deletions

View File

@ -162,6 +162,20 @@ module Oga
return !!html return !!html
end end
##
# Tells whether the lexer is in HTML mode and the current element equals
# HTML_SCRIPT.
#
# @return [TrueClass|FalseClass]
#
def html_script?
  in_html = html?

  in_html && current_element == HTML_SCRIPT
end
##
# Tells whether the lexer is in HTML mode and the current element equals
# HTML_STYLE.
#
# @return [TrueClass|FalseClass]
#
def html_style?
  in_html = html?

  in_html && current_element == HTML_STYLE
end
private private
## ##
@ -194,20 +208,6 @@ module Oga
return @elements.last return @elements.last
end end
##
# Tells whether the lexer is in HTML mode and the current element equals
# HTML_SCRIPT.
#
# @return [TrueClass|FalseClass]
#
def html_script?
  in_html = html?

  in_html && current_element == HTML_SCRIPT
end
##
# Tells whether the lexer is in HTML mode and the current element equals
# HTML_STYLE.
#
# @return [TrueClass|FalseClass]
#
def html_style?
  in_html = html?

  in_html && current_element == HTML_STYLE
end
## ##
# Called when processing a single quote. # Called when processing a single quote.
# #

View File

@ -128,6 +128,10 @@ module Oga
key = ns ? "#{ns}:#{name}" : name key = ns ? "#{ns}:#{name}" : name
if value
value = EntityDecoder.try_decode(value, @lexer.html?)
end
return {key => value} return {key => value}
end end
@ -152,8 +156,30 @@ module Oga
return merged return merged
end end
##
# Forwards a text node to the handler's `on_text` callback, if the handler
# defines one. Entities in the text are decoded first, unless the parser is
# currently inside literal HTML (see `inside_literal_html?`), in which case
# the text is passed on untouched.
#
# @param [String] text
#
def on_text(text)
  return unless @handler.respond_to?(:on_text)

  payload = inside_literal_html? ? text : EntityDecoder.try_decode(text, @lexer.html?)

  run_callback(:on_text, payload)

  nil
end
private private
##
# Tells whether the lexer reports being inside an HTML script or style
# element, as indicated by its `html_script?` and `html_style?` predicates.
#
# @return [TrueClass|FalseClass]
#
def inside_literal_html?
  in_script = @lexer.html_script?

  in_script || @lexer.html_style?
end
## ##
# @param [Symbol] method # @param [Symbol] method
# @param [Array] args # @param [Array] args

View File

@ -58,6 +58,42 @@ describe Oga::XML::SaxParser do
handler.attrs.should == {'b' => '10', 'x:c' => '20'} handler.attrs.should == {'b' => '10', 'x:c' => '20'}
end end
describe 'when parsing XML documents' do
  it 'decodes XML entities in text nodes' do
    # Minimal handler that records the last text node it was given.
    handler = Class.new do
      attr_reader :text

      def on_text(text)
        @text = text
      end
    end.new

    # The input has to carry the *encoded* entity: a bare "<" contains
    # nothing to decode and is not a valid text node on its own.
    parser = described_class.new(handler, '&lt;')

    parser.parse

    handler.text.should == '<'
  end
end
describe 'when parsing HTML documents' do
  it 'decodes HTML entities in text nodes' do
    # Minimal handler that records the last text node it was given.
    recorder_class = Class.new do
      attr_reader :text

      def on_text(value)
        @text = value
      end
    end

    handler  = recorder_class.new
    expected = Oga::HTML::Entities::DECODE_MAPPING['&nbsp;']

    described_class.new(handler, '&nbsp;', :html => true).parse

    handler.text.should == expected
  end
end
end end
describe '#on_attribute' do describe '#on_attribute' do
@ -84,6 +120,29 @@ describe Oga::XML::SaxParser do
hash.should == {'FOO' => 'bar'} hash.should == {'FOO' => 'bar'}
end end
describe 'when parsing an XML document' do
  it 'decodes XML entities' do
    sax = described_class.new(@handler_without, '<a a="&lt;" />')

    # on_attribute is called directly with the still-encoded value.
    sax.on_attribute('a', nil, '&lt;').should == {'a' => '<'}
  end
end
describe 'when parsing an HTML document' do
  it 'decodes HTML entities' do
    markup   = '<a a="&nbsp;" />'
    expected = {'a' => Oga::HTML::Entities::DECODE_MAPPING['&nbsp;']}

    sax = described_class.new(@handler_without, markup, :html => true)

    # on_attribute is called directly with the still-encoded value.
    sax.on_attribute('a', nil, '&nbsp;').should == expected
  end
end
end end
describe '#on_attributes' do describe '#on_attributes' do
@ -111,4 +170,79 @@ describe Oga::XML::SaxParser do
retval.should == %w{Alice Bob} retval.should == %w{Alice Bob}
end end
end end
describe '#on_text' do
  it 'invokes a custom on_text callback if defined' do
    handler = Class.new do
      attr_reader :text

      def on_text(text)
        @text = text.upcase
      end
    end.new

    parser = described_class.new(handler, nil)

    parser.on_text('foo')

    handler.text.should == 'FOO'
  end

  describe 'when parsing an XML document' do
    before do
      # Handler that records the text it receives without modifying it.
      @handler = Class.new do
        attr_reader :text

        def on_text(text)
          @text = text
        end
      end.new

      @parser = described_class.new(@handler, nil)
    end

    it 'decodes XML entities' do
      @parser.on_text('&lt;')

      @handler.text.should == '<'
    end
  end

  describe 'when parsing an HTML document' do
    before do
      # Handler that records the text it receives without modifying it.
      @handler = Class.new do
        attr_reader :text

        def on_text(text)
          @text = text
        end
      end.new

      @parser = described_class.new(@handler, nil, :html => true)

      # The literal-HTML check consults the parser's lexer, so the script and
      # style examples stub the lexer predicates individually instead of
      # sharing one identical stub of the parser's private helper.
      @lexer = @parser.instance_variable_get(:@lexer)
    end

    it 'decodes HTML entities' do
      @parser.on_text('&nbsp;')

      @handler.text.should ==
        Oga::HTML::Entities::DECODE_MAPPING['&nbsp;']
    end

    it 'does not decode HTML entities of script tags' do
      @lexer.stub(:html_script?).and_return(true)
      @lexer.stub(:html_style?).and_return(false)

      @parser.on_text('&nbsp;')

      @handler.text.should == '&nbsp;'
    end

    it 'does not decode HTML entities of style tags' do
      @lexer.stub(:html_script?).and_return(false)
      @lexer.stub(:html_style?).and_return(true)

      @parser.on_text('&nbsp;')

      @handler.text.should == '&nbsp;'
    end
  end
end
end end