Fixes #118 - decoding invalid entities.
The previous regular expression was too greedy: it matched letters outside the A-F hex range, and it matched letters even when not in hex mode.
This commit is contained in:
parent
8990a62224
commit
6fc3ef425b
|
@ -63,7 +63,7 @@ module Oga
|
|||
#
|
||||
# @return [Regexp]
|
||||
#
|
||||
CODEPOINT_ENTITY = /&#(x)?([a-zA-Z0-9]+);/
|
||||
CODEPOINT_ENTITY = /&#(x[a-fA-F0-9]+|\d+);/
|
||||
|
||||
##
|
||||
# @return [Regexp]
|
||||
|
@ -90,7 +90,7 @@ module Oga
|
|||
|
||||
if input.include?(AMPERSAND)
|
||||
input = input.gsub(CODEPOINT_ENTITY) do |match|
|
||||
[$1 ? Integer($2, 16) : Integer($2, 10)].pack('U*')
|
||||
[$1.start_with?('x') ? Integer($1[1..-1], 16) : Integer($1, 10)].pack('U*')
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -73,6 +73,18 @@ describe Oga::XML::Entities do
|
|||
it 'decodes numeric entities starting with a 0' do
|
||||
described_class.decode('&#038;').should == '&'
|
||||
end
|
||||
|
||||
it 'preserves entity-like tokens' do
|
||||
described_class.decode('&#TAB;').should == '&#TAB;'
|
||||
end
|
||||
|
||||
it 'preserves entity-like hex tokens' do
|
||||
described_class.decode('&#x;').should == '&#x;'
|
||||
end
|
||||
|
||||
it 'preserves entity-like letters in non-hex mode' do
|
||||
described_class.decode('&#123A;').should == '&#123A;'
|
||||
end
|
||||
end
|
||||
|
||||
describe 'encode' do
|
||||
|
|
Loading…
Reference in New Issue