From 6fc3ef425b1c6e05f99eb1d240bbffcbbcc5581e Mon Sep 17 00:00:00 2001 From: Jakub Pawlowicz Date: Mon, 29 Jun 2015 16:30:03 +0100 Subject: [PATCH] Fixes #118 - decoding invalid entities. Previous regular expression was too greedy in terms of matching letters from outside of A-F hex scope, and matching letters when not in hex mode. --- lib/oga/xml/entities.rb | 4 ++-- spec/oga/xml/entities_spec.rb | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/lib/oga/xml/entities.rb b/lib/oga/xml/entities.rb index e75092e..787a7c3 100644 --- a/lib/oga/xml/entities.rb +++ b/lib/oga/xml/entities.rb @@ -63,7 +63,7 @@ module Oga # # @return [Regexp] # - CODEPOINT_ENTITY = /&#(x)?([a-zA-Z0-9]+);/ + CODEPOINT_ENTITY = /&#(x[a-fA-F0-9]+|\d+);/ ## # @return [Regexp] @@ -90,7 +90,7 @@ module Oga if input.include?(AMPERSAND) input = input.gsub(CODEPOINT_ENTITY) do |match| - [$1 ? Integer($2, 16) : Integer($2, 10)].pack('U*') + [$1.start_with?('x') ? Integer($1[1..-1], 16) : Integer($1, 10)].pack('U*') end end diff --git a/spec/oga/xml/entities_spec.rb b/spec/oga/xml/entities_spec.rb index 62c0666..3a31162 100644 --- a/spec/oga/xml/entities_spec.rb +++ b/spec/oga/xml/entities_spec.rb @@ -73,6 +73,18 @@ describe Oga::XML::Entities do it 'decodes numeric entities starting with a 0' do described_class.decode('&#038;').should == '&' end + + it 'preserves entity-like tokens' do + described_class.decode('&#TAB;').should == '&#TAB;' + end + + it 'preserves entity-like hex tokens' do + described_class.decode('&#x;').should == '&#x;' + end + + it 'preserves entity-like letters in non-hex mode' do + described_class.decode('&#123A;').should == '&#123A;' + end end describe 'encode' do