Fixes #118 - decoding invalid entities.

The previous regular expression was too permissive: it matched
letters outside the A-F hex range, and it also matched letters when
not in hex mode (i.e. in decimal entities).
This commit is contained in:
Jakub Pawlowicz 2015-06-29 16:30:03 +01:00 committed by Yorick Peterse
parent 8990a62224
commit 6fc3ef425b
2 changed files with 14 additions and 2 deletions

View File

@ -63,7 +63,7 @@ module Oga
# #
# @return [Regexp] # @return [Regexp]
# #
CODEPOINT_ENTITY = /&#(x)?([a-zA-Z0-9]+);/ CODEPOINT_ENTITY = /&#(x[a-fA-F0-9]+|\d+);/
## ##
# @return [Regexp] # @return [Regexp]
@ -90,7 +90,7 @@ module Oga
if input.include?(AMPERSAND) if input.include?(AMPERSAND)
input = input.gsub(CODEPOINT_ENTITY) do |match| input = input.gsub(CODEPOINT_ENTITY) do |match|
[$1 ? Integer($2, 16) : Integer($2, 10)].pack('U*') [$1.start_with?('x') ? Integer($1[1..-1], 16) : Integer($1, 10)].pack('U*')
end end
end end

View File

@ -73,6 +73,18 @@ describe Oga::XML::Entities do
it 'decodes numeric entities starting with a 0' do it 'decodes numeric entities starting with a 0' do
described_class.decode('&#038;').should == '&' described_class.decode('&#038;').should == '&'
end end
it 'preserves entity-like tokens' do
described_class.decode('&#TAB;').should == '&#TAB;'
end
it 'preserves entity-like hex tokens' do
described_class.decode('&#x;').should == '&#x;'
end
it 'preserves entity-like letters in non-hex mode' do
described_class.decode('&#123A;').should == '&#123A;'
end
end end
describe 'encode' do describe 'encode' do