Fixes #118 - decoding invalid entities.

The previous regular expression was too greedy: it matched letters
outside the A-F hex range, and it matched letters even when not in
hex mode.
This commit is contained in:
Jakub Pawlowicz 2015-06-29 16:30:03 +01:00 committed by Yorick Peterse
parent 8990a62224
commit 6fc3ef425b
2 changed files with 14 additions and 2 deletions

View File

@ -63,7 +63,7 @@ module Oga
#
# @return [Regexp]
#
CODEPOINT_ENTITY = /&#(x)?([a-zA-Z0-9]+);/
CODEPOINT_ENTITY = /&#(x[a-fA-F0-9]+|\d+);/
##
# @return [Regexp]
@ -90,7 +90,7 @@ module Oga
if input.include?(AMPERSAND)
input = input.gsub(CODEPOINT_ENTITY) do |match|
[$1 ? Integer($2, 16) : Integer($2, 10)].pack('U*')
[$1.start_with?('x') ? Integer($1[1..-1], 16) : Integer($1, 10)].pack('U*')
end
end

View File

@ -73,6 +73,18 @@ describe Oga::XML::Entities do
# The input has to be a zero-padded decimal reference: codepoint 38 is '&'.
# A bare '&' contains no numeric entity and would not exercise the
# leading-zero case this example is named for.
it 'decodes numeric entities starting with a 0' do
described_class.decode('&#038;').should == '&'
end
# '&#' followed by letters with no 'x' prefix is not a numeric reference;
# decode is expected to return the text unchanged.
it 'preserves entity-like tokens' do
described_class.decode('&#TAB;').should == '&#TAB;'
end
# An 'x' prefix with no hex digits after it is not a complete reference;
# decode is expected to return the text unchanged.
it 'preserves entity-like hex tokens' do
described_class.decode('&#x;').should == '&#x;'
end
# Without the 'x' prefix only decimal digits are acceptable; the trailing 'A'
# makes this token invalid, so decode is expected to return it unchanged.
it 'preserves entity-like letters in non-hex mode' do
described_class.decode('&#123A;').should == '&#123A;'
end
end
describe 'encode' do