Splits entity decoding into two steps.

Decoding decimal and hex entities separately results in nicer code
that does not rely on string matching and slicing.
This commit is contained in:
Jakub Pawlowicz 2015-06-30 09:50:20 +01:00 committed by Yorick Peterse
parent 6fc3ef425b
commit 6786dde3f0
1 changed file with 17 additions and 4 deletions

View File

@ -59,11 +59,18 @@ module Oga
REGULAR_ENTITY = /&[a-zA-Z0-9]+;/
##
# Regexp for matching XML/HTML entities such as "&#38;".
# Regexp for matching XML/HTML numeric entities such as "&#38;".
#
# @return [Regexp]
#
CODEPOINT_ENTITY = /&#(x[a-fA-F0-9]+|\d+);/
NUMERIC_CODE_POINT_ENTITY = /&#(\d+);/
##
# Regexp for matching XML/HTML hex entities such as "&#x3C;".
#
# @return [Regexp]
#
HEX_CODE_POINT_ENTITY = /&#x([a-fA-F0-9]+);/
##
# @return [Regexp]
@ -89,8 +96,14 @@ module Oga
input = input.gsub(REGULAR_ENTITY, mapping)
if input.include?(AMPERSAND)
input = input.gsub(CODEPOINT_ENTITY) do |match|
[$1.start_with?('x') ? Integer($1[1..-1], 16) : Integer($1, 10)].pack('U*')
input = input.gsub(NUMERIC_CODE_POINT_ENTITY) do
[Integer($1, 10)].pack('U*')
end
end
if input.include?(AMPERSAND)
input = input.gsub(HEX_CODE_POINT_ENTITY) do
[Integer($1, 16)].pack('U*')
end
end