# Patch overview: entity decoding in the SAX parser (three files touched).
#
# lib/oga/xml/lexer.rb
#   Moves `html_script?` and `html_style?` from below the `private` marker to
#   above it. The method bodies are unchanged: each returns
#   `html? && current_element == HTML_SCRIPT` (resp. `HTML_STYLE`). The move is
#   what lets the SAX parser call them on `@lexer` (see `inside_literal_html?`).
#
# lib/oga/xml/sax_parser.rb
#   * Attribute hunk: when `value` is truthy it is replaced by
#     `EntityDecoder.try_decode(value, @lexer.html?)` before `{key => value}`
#     is returned; a nil/false value is passed through untouched.
#   * New `on_text(text)`: does nothing unless `@handler` responds to
#     `:on_text`. Otherwise the text is run through `EntityDecoder.try_decode`
#     (skipped when `inside_literal_html?` is true) and handed to
#     `run_callback(:on_text, text)`. The method ends with a bare `return`,
#     so it always returns nil.
#   * New private `inside_literal_html?`: true when the lexer reports either
#     `html_script?` or `html_style?`.
#
# spec/oga/xml/sax_parser_spec.rb
#   Adds examples covering decoding of text nodes during `parse`, decoding in
#   `on_attribute`, and a `#on_text` group (custom callback, XML decoding,
#   HTML decoding, and no decoding when `inside_literal_html?` is stubbed true).
#
# NOTE(review): line breaks and indentation are collapsed in this copy of the
#   patch, so the hunks cannot be applied as they stand; restore the layout
#   from the original commit before use.
# NOTE(review): the spec inputs '<' and ' ' are expected to decode to '<' and
#   to DECODE_MAPPING[' ']. They look like entity references (presumably
#   '&lt;' and '&nbsp;') that were decoded in transit, which would make these
#   examples vacuous. Confirm against the original commit.
# NOTE(review): the "script tags" and "style tags" examples are identical; both
#   only stub `inside_literal_html?`, so neither exercises the lexer's
#   `html_script?` / `html_style?` separately.
diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb index 4b05b29..7038275 100644 --- a/lib/oga/xml/lexer.rb +++ b/lib/oga/xml/lexer.rb @@ -162,6 +162,20 @@ module Oga return !!html end + ## + # @return [TrueClass|FalseClass] + # + def html_script? + return html? && current_element == HTML_SCRIPT + end + + ## + # @return [TrueClass|FalseClass] + # + def html_style? + return html? && current_element == HTML_STYLE + end + private ## @@ -194,20 +208,6 @@ module Oga return @elements.last end - ## - # @return [TrueClass|FalseClass] - # - def html_script? - return html? && current_element == HTML_SCRIPT - end - - ## - # @return [TrueClass|FalseClass] - # - def html_style? - return html? && current_element == HTML_STYLE - end - ## # Called when processing a single quote. # diff --git a/lib/oga/xml/sax_parser.rb b/lib/oga/xml/sax_parser.rb index 37bc910..2d4033b 100644 --- a/lib/oga/xml/sax_parser.rb +++ b/lib/oga/xml/sax_parser.rb @@ -128,6 +128,10 @@ module Oga key = ns ? "#{ns}:#{name}" : name + if value + value = EntityDecoder.try_decode(value, @lexer.html?) + end + return {key => value} end @@ -152,8 +156,30 @@ module Oga return merged end + ## + # @param [String] text + # + def on_text(text) + if @handler.respond_to?(:on_text) + unless inside_literal_html? + text = EntityDecoder.try_decode(text, @lexer.html?) + end + + run_callback(:on_text, text) + end + + return + end + private + ## + # @return [TrueClass|FalseClass] + # + def inside_literal_html? + return @lexer.html_script? || @lexer.html_style? 
+ end + ## # @param [Symbol] method # @param [Array] args diff --git a/spec/oga/xml/sax_parser_spec.rb b/spec/oga/xml/sax_parser_spec.rb index e41baac..57deb46 100644 --- a/spec/oga/xml/sax_parser_spec.rb +++ b/spec/oga/xml/sax_parser_spec.rb @@ -58,6 +58,42 @@ describe Oga::XML::SaxParser do handler.attrs.should == {'b' => '10', 'x:c' => '20'} end + + describe 'when parsing XML documents' do + it 'decodes XML entities in text nodes' do + handler = Class.new do + attr_reader :text + + def on_text(text) + @text = text + end + end.new + + parser = described_class.new(handler, '<') + + parser.parse + + handler.text.should == '<' + end + end + + describe 'when parsing HTML documents' do + it 'decodes HTML entities in text nodes' do + handler = Class.new do + attr_reader :text + + def on_text(text) + @text = text + end + end.new + + parser = described_class.new(handler, ' ', :html => true) + + parser.parse + + handler.text.should == Oga::HTML::Entities::DECODE_MAPPING[' '] + end + end end describe '#on_attribute' do @@ -84,6 +120,29 @@ describe Oga::XML::SaxParser do hash.should == {'FOO' => 'bar'} end + + describe 'when parsing an XML document' do + it 'decodes XML entities' do + parser = described_class.new(@handler_without, '') + hash = parser.on_attribute('a', nil, '<') + + hash.should == {'a' => '<'} + end + end + + describe 'when parsing an HTML document' do + it 'decodes HTML entities' do + parser = described_class.new( + @handler_without, + '', + :html => true + ) + + hash = parser.on_attribute('a', nil, ' ') + + hash.should == {'a' => Oga::HTML::Entities::DECODE_MAPPING[' ']} + end + end end describe '#on_attributes' do @@ -111,4 +170,79 @@ describe Oga::XML::SaxParser do retval.should == %w{Alice Bob} end end + + describe '#on_text' do + it 'invokes a custom on_text callback if defined' do + handler = Class.new do + attr_reader :text + + def on_text(text) + @text = text.upcase + end + end.new + + parser = described_class.new(handler, nil) + + 
parser.on_text('foo') + + handler.text.should == 'FOO' + end + + describe 'when parsing an XML document' do + before do + @handler = Class.new do + attr_reader :text + + def on_text(text) + @text = text + end + end.new + + @parser = described_class.new(@handler, nil) + end + + it 'decodes XML entities' do + @parser.on_text('<') + + @handler.text.should == '<' + end + end + + describe 'when parsing an HTML document' do + before do + @handler = Class.new do + attr_reader :text + + def on_text(text) + @text = text + end + end.new + + @parser = described_class.new(@handler, nil, :html => true) + end + + it 'decodes HTML entities' do + @parser.on_text(' ') + + @handler.text.should == + Oga::HTML::Entities::DECODE_MAPPING[' '] + end + + it 'does not decode HTML entities of script tags' do + @parser.stub(:inside_literal_html?).and_return(true) + + @parser.on_text(' ') + + @handler.text.should == ' ' + end + + it 'does not decode HTML entities of style tags' do + @parser.stub(:inside_literal_html?).and_return(true) + + @parser.on_text(' ') + + @handler.text.should == ' ' + end + end + end end