Preserve entities that can't be decoded

Certain entities when decoded will produce a String with an invalid
encoding. This commit ensures that instead of raising an EncodingError
further down the line (e.g. when calling "inspect" on a document) the
entities are preserved as-is.

Fixes #143
This commit is contained in:
Yorick Peterse 2016-02-09 19:51:53 +01:00
parent 76b183e7ab
commit 5bfc2d50f2
2 changed files with 23 additions and 4 deletions

View File

@ -74,14 +74,14 @@ module Oga
input = input.gsub(REGULAR_ENTITY, mapping)
if input.include?(AMPERSAND)
input = input.gsub(NUMERIC_CODE_POINT_ENTITY) do
[Integer($1, 10)].pack('U*')
input = input.gsub(NUMERIC_CODE_POINT_ENTITY) do |found|
pack_string($1, 10) || found
end
end
if input.include?(AMPERSAND)
input = input.gsub(HEX_CODE_POINT_ENTITY) do
[Integer($1, 16)].pack('U*')
input = input.gsub(HEX_CODE_POINT_ENTITY) do |found|
pack_string($1, 16) || found
end
end
@ -104,6 +104,17 @@ module Oga
# Returns a copy of the input in which every match of
# ENCODE_ATTRIBUTE_REGEXP has been substituted using
# ENCODE_ATTRIBUTE_MAPPING.
#
# NOTE(review): presumably ENCODE_ATTRIBUTE_MAPPING is a Hash keyed by the
# matched text - confirm against its definition.
#
# @param [String] input
# @return [String]
def self.encode_attribute(input)
input.gsub(ENCODE_ATTRIBUTE_REGEXP, ENCODE_ATTRIBUTE_MAPPING)
end
private
# NOTE(review): a bare `private` does not change the visibility of methods
# defined with `def self.`; `private_class_method :pack_string` would be
# needed to actually hide the method below.

# Converts the textual code point of a numeric entity into the matching
# UTF-8 character.
#
# Returns nil when the code point can't be turned into a validly encoded
# String, so a caller can write `pack_string(...) || original_text` to keep
# the entity as-is.
#
# @param [String] input The digits of the code point.
# @param [Fixnum] base The radix of the digits (10 or 16 in this file).
# @return [String, nil]
def self.pack_string(input, base)
  packed = [Integer(input, base)].pack('U*')

  packed.valid_encoding? ? packed : nil
rescue RangeError
  # Array#pack('U*') refuses code points it can't represent at all (anything
  # above 0x7fffffff). Treat those like any other undecodable entity instead
  # of letting the error escape.
  nil
end
end # Entities
end # XML
end # Oga

View File

@ -85,6 +85,14 @@ describe Oga::XML::Entities do
# '&#123A;' has a letter after the decimal digits, so it only looks like a
# numeric entity; decode is expected to return it unchanged.
it 'preserves entity-like letters in non-hex mode' do
described_class.decode('&#123A;').should == '&#123A;'
end
# 2013265920 (0x78000000) lies far outside the Unicode range, so packing it
# yields a String with an invalid encoding; decode must hand the entity text
# back untouched. The input has to be the literal entity, not an already
# decoded character, or the example passes without exercising the decoder.
it "preserves numeric entities when they can't be decoded" do
  described_class.decode('&#2013265920;').should == '&#2013265920;'
end
# 0xffffff lies outside the Unicode range, so packing it yields a String
# with an invalid encoding; decode must hand the entity text back untouched.
# The input has to be the literal entity, not an already decoded character,
# or the example passes without exercising the decoder.
it "preserves hex entities when they can't be decoded" do
  described_class.decode('&#xffffff;').should == '&#xffffff;'
end
end
describe 'encode' do