diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb
index 4b05b29..7038275 100644
--- a/lib/oga/xml/lexer.rb
+++ b/lib/oga/xml/lexer.rb
@@ -162,6 +162,20 @@ module Oga
return !!html
end
+ ##
+ # @return [TrueClass|FalseClass] true only when `html?` holds and `current_element` equals `HTML_SCRIPT`.
+ #
+ def html_script?
+ return html? && current_element == HTML_SCRIPT
+ end
+
+ ##
+ # @return [TrueClass|FalseClass] true only when `html?` holds and `current_element` equals `HTML_STYLE`.
+ #
+ def html_style?
+ return html? && current_element == HTML_STYLE
+ end
+
private
##
@@ -194,20 +208,6 @@ module Oga
return @elements.last
end
- ##
- # @return [TrueClass|FalseClass]
- #
- def html_script?
- return html? && current_element == HTML_SCRIPT
- end
-
- ##
- # @return [TrueClass|FalseClass]
- #
- def html_style?
- return html? && current_element == HTML_STYLE
- end
-
##
# Called when processing a single quote.
#
diff --git a/lib/oga/xml/sax_parser.rb b/lib/oga/xml/sax_parser.rb
index 37bc910..2d4033b 100644
--- a/lib/oga/xml/sax_parser.rb
+++ b/lib/oga/xml/sax_parser.rb
@@ -128,6 +128,10 @@ module Oga
key = ns ? "#{ns}:#{name}" : name
+ if value
+ value = EntityDecoder.try_decode(value, @lexer.html?)
+ end
+
return {key => value}
end
@@ -152,8 +156,30 @@ module Oga
return merged
end
+ ##
+ # @param [String] text Text forwarded to the handler's `on_text`, if the handler defines one.
+ #
+ def on_text(text)
+ return unless @handler.respond_to?(:on_text)
+
+ # Text for which inside_literal_html? is true is forwarded undecoded.
+ decode = !inside_literal_html?
+ text = EntityDecoder.try_decode(text, @lexer.html?) if decode
+
+ run_callback(:on_text, text)
+
+ return
+ end
+
private
+ ##
+ # @return [TrueClass|FalseClass] truthy when the lexer reports `html_script?` or `html_style?`.
+ #
+ def inside_literal_html?
+ return @lexer.html_script? || @lexer.html_style?
+ end
+
##
# @param [Symbol] method
# @param [Array] args
diff --git a/spec/oga/xml/sax_parser_spec.rb b/spec/oga/xml/sax_parser_spec.rb
index e41baac..57deb46 100644
--- a/spec/oga/xml/sax_parser_spec.rb
+++ b/spec/oga/xml/sax_parser_spec.rb
@@ -58,6 +58,42 @@ describe Oga::XML::SaxParser do
handler.attrs.should == {'b' => '10', 'x:c' => '20'}
end
+
+ describe 'when parsing XML documents' do
+ it 'decodes XML entities in text nodes' do
+ handler = Class.new do
+ attr_reader :text
+
+ def on_text(text)
+ @text = text
+ end
+ end.new
+
+ parser = described_class.new(handler, '&lt;') # input is the entity reference, not the decoded character
+
+ parser.parse
+
+ handler.text.should == '<'
+ end
+ end
+
+ describe 'when parsing HTML documents' do
+ it 'decodes HTML entities in text nodes' do
+ handler = Class.new do
+ attr_reader :text
+
+ def on_text(text)
+ @text = text
+ end
+ end.new
+
+ parser = described_class.new(handler, '&nbsp;', :html => true) # input is the entity reference
+
+ parser.parse
+
+ handler.text.should == Oga::HTML::Entities::DECODE_MAPPING['&nbsp;'] # mapping is looked up by entity reference
+ end
+ end
end
describe '#on_attribute' do
@@ -84,6 +120,29 @@ describe Oga::XML::SaxParser do
hash.should == {'FOO' => 'bar'}
end
+
+ describe 'when parsing an XML document' do
+ it 'decodes XML entities' do
+ parser = described_class.new(@handler_without, '')
+ hash = parser.on_attribute('a', nil, '&lt;') # attribute value is the entity reference
+
+ hash.should == {'a' => '<'}
+ end
+ end
+
+ describe 'when parsing an HTML document' do
+ it 'decodes HTML entities' do
+ parser = described_class.new(
+ @handler_without,
+ '',
+ :html => true
+ )
+
+ hash = parser.on_attribute('a', nil, '&nbsp;') # attribute value is the entity reference
+
+ hash.should == {'a' => Oga::HTML::Entities::DECODE_MAPPING['&nbsp;']}
+ end
+ end
end
describe '#on_attributes' do
@@ -111,4 +170,79 @@ describe Oga::XML::SaxParser do
retval.should == %w{Alice Bob}
end
end
+
+ describe '#on_text' do
+ it 'invokes a custom on_text callback if defined' do
+ handler = Class.new do
+ attr_reader :text
+
+ def on_text(text)
+ @text = text.upcase
+ end
+ end.new
+
+ parser = described_class.new(handler, nil)
+
+ parser.on_text('foo')
+
+ handler.text.should == 'FOO'
+ end
+
+ describe 'when parsing an XML document' do
+ before do
+ @handler = Class.new do
+ attr_reader :text
+
+ def on_text(text)
+ @text = text
+ end
+ end.new
+
+ @parser = described_class.new(@handler, nil)
+ end
+
+ it 'decodes XML entities' do
+ @parser.on_text('&lt;') # entity reference in, decoded character out
+
+ @handler.text.should == '<'
+ end
+ end
+
+ describe 'when parsing an HTML document' do
+ before do
+ @handler = Class.new do
+ attr_reader :text
+
+ def on_text(text)
+ @text = text
+ end
+ end.new
+
+ @parser = described_class.new(@handler, nil, :html => true)
+ end
+
+ it 'decodes HTML entities' do
+ @parser.on_text('&nbsp;')
+
+ @handler.text.should ==
+ Oga::HTML::Entities::DECODE_MAPPING['&nbsp;']
+ end
+
+ it 'does not decode HTML entities of script tags' do
+ @parser.stub(:inside_literal_html?).and_return(true) # forces the undecoded branch of on_text
+
+ @parser.on_text('&nbsp;')
+
+ @handler.text.should == '&nbsp;' # entity reference must come through untouched
+ end
+
+ it 'does not decode HTML entities of style tags' do
+ @parser.stub(:inside_literal_html?).and_return(true) # NOTE(review): same stub as the script example; the two examples exercise the same path
+
+ @parser.on_text('&nbsp;')
+
+ @handler.text.should == '&nbsp;' # entity reference must come through untouched
+ end
+ end
+ end
end