Encode/decode XML entities.
When lexing XML entities such as `&amp;` and `&lt;`, these sequences are now
converted into their "actual" forms (`&` and `<`). In turn, Oga::XML::Text#to_xml
ensures they are encoded again when the method is called.
Performance-wise this puts some strain on the lexer, for every T_TEXT/T_STRING
node now potentially has to have its content modified. In the benchmark
xml/lexer/string_average_bench.rb the average processing time is now about the
same as before the improvements made in
8db77c0a09
. I was hoping that the lexer would
still be a bit faster, but alas this is not the case. Doing this in native code
would be a nightmare as C doesn't have a proper string replacement function. I'm
not old/sadistic enough to write one myself just yet.
This fixes #49
This commit is contained in:
parent
3307e2f4d2
commit
5f7256eb8f
|
@ -21,6 +21,7 @@ end
|
|||
#:nocov:
|
||||
|
||||
require_relative 'oga/xml/html_void_elements'
|
||||
require_relative 'oga/xml/entities'
|
||||
require_relative 'oga/xml/querying'
|
||||
require_relative 'oga/xml/traversal'
|
||||
require_relative 'oga/xml/node'
|
||||
|
|
|
@ -87,7 +87,9 @@ module Oga
|
|||
full_name = name
|
||||
end
|
||||
|
||||
return %Q(#{full_name}="#{value}")
|
||||
enc_value = value ? Entities.encode(value) : nil
|
||||
|
||||
return %Q(#{full_name}="#{enc_value}")
|
||||
end
|
||||
|
||||
##
|
||||
|
|
|
@ -0,0 +1,56 @@
|
|||
module Oga
  module XML
    ##
    # Conversion between the XML entities `&amp;`, `&lt;` and `&gt;` and the
    # characters they represent.
    #
    module Entities
      ##
      # Hash containing XML entities and the corresponding characters.
      #
      # @return [Hash]
      #
      DECODE_MAPPING = {
        '&amp;' => '&',
        '&lt;'  => '<',
        '&gt;'  => '>'
      }.freeze

      ##
      # Hash containing characters and the corresponding XML entities.
      #
      # @return [Hash]
      #
      ENCODE_MAPPING = DECODE_MAPPING.invert.freeze

      ##
      # Regexp matching any of the entities in {DECODE_MAPPING}.
      #
      # @return [Regexp]
      #
      DECODE_REGEXP = Regexp.union(DECODE_MAPPING.keys)

      ##
      # Regexp matching any of the characters in {ENCODE_MAPPING}.
      #
      # @return [Regexp]
      #
      ENCODE_REGEXP = Regexp.union(ENCODE_MAPPING.keys)

      ##
      # Decodes XML entities.
      #
      # All entities are replaced in a single pass so that the output of one
      # replacement is never decoded again: `&amp;lt;` becomes `&lt;`, not `<`.
      #
      # @param [String] input
      # @return [String]
      #
      def self.decode(input)
        # Cheap check first: most text contains no entities at all.
        return input unless input.include?('&')

        return input.gsub(DECODE_REGEXP, DECODE_MAPPING)
      end

      ##
      # Encodes special characters as XML entities.
      #
      # The replacement happens in a single pass, so the `&` of a freshly
      # produced entity is never encoded a second time.
      #
      # @param [String] input
      # @return [String]
      #
      def self.encode(input)
        return input.gsub(ENCODE_REGEXP, ENCODE_MAPPING)
      end
    end # Entities
  end # XML
end # Oga
|
|
@ -194,7 +194,7 @@ module Oga
|
|||
# @param [String] value The data between the quotes.
|
||||
#
|
||||
def on_string(value)
|
||||
add_token(:T_STRING, value)
|
||||
add_token(:T_STRING, Entities.decode(value))
|
||||
end
|
||||
|
||||
##
|
||||
|
@ -348,7 +348,9 @@ module Oga
|
|||
# @param [String] value
|
||||
#
|
||||
def on_text(value)
|
||||
add_token(:T_TEXT, value) unless value.empty?
|
||||
return if value.empty?
|
||||
|
||||
add_token(:T_TEXT, Entities.decode(value))
|
||||
end
|
||||
|
||||
##
|
||||
|
|
|
@ -5,7 +5,12 @@ module Oga
|
|||
# have any children, attributes and the likes; just text.
|
||||
#
|
||||
class Text < CharacterNode
  ##
  # Serializes the node by taking the parent class' XML output and running
  # it through {Oga::XML::Entities.encode}.
  #
  # @see [Oga::XML::CharacterNode#to_xml]
  #
  def to_xml
    serialized = super

    return Entities.encode(serialized)
  end
end # Text
|
||||
end # XML
|
||||
end # Oga
|
||||
|
|
|
@ -78,6 +78,12 @@ describe Oga::XML::Attribute do
|
|||
|
||||
attr.to_xml.should == 'xmlns:class=""'
|
||||
end
|
||||
|
||||
# The raw value holds literal special characters; the serialized attribute
# must contain their entity forms.
example 'convert special characters to XML entities' do
  attr = described_class.new(:name => 'href', :value => '&<>')

  attr.to_xml.should == 'href="&amp;&lt;&gt;"'
end
|
||||
end
|
||||
|
||||
context '#inspect' do
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
require 'spec_helper'
|
||||
|
||||
# Checks that each supported entity is decoded to its character and that each
# special character is encoded as its entity.
describe Oga::XML::Entities do
  context 'decode' do
    example 'decode &amp; into &' do
      described_class.decode('&amp;').should == '&'
    end

    example 'decode &lt; into <' do
      described_class.decode('&lt;').should == '<'
    end

    example 'decode &gt; into >' do
      described_class.decode('&gt;').should == '>'
    end
  end

  context 'encode' do
    example 'encode & as &amp;' do
      described_class.encode('&').should == '&amp;'
    end

    example 'encode < as &lt;' do
      described_class.encode('<').should == '&lt;'
    end

    example 'encode > as &gt;' do
      described_class.encode('>').should == '&gt;'
    end
  end
end
|
|
@ -0,0 +1,49 @@
|
|||
require 'spec_helper'
|
||||
|
||||
# Checks that entities found in text and in attribute strings are emitted as
# their decoded characters in the resulting tokens.
describe Oga::XML::Lexer do
  context 'converting XML entities in text tokens' do
    example 'convert &amp; into &' do
      lex('&amp;').should == [[:T_TEXT, '&', 1]]
    end

    example 'convert &lt; into <' do
      lex('&lt;').should == [[:T_TEXT, '<', 1]]
    end

    example 'convert &gt; into >' do
      lex('&gt;').should == [[:T_TEXT, '>', 1]]
    end
  end

  context 'converting XML entities in string tokens' do
    example 'convert &amp; into &' do
      lex('<foo class="&amp;" />').should == [
        [:T_ELEM_START, nil, 1],
        [:T_ELEM_NAME, 'foo', 1],
        [:T_ATTR, 'class', 1],
        [:T_STRING, '&', 1],
        [:T_ELEM_END, nil, 1]
      ]
    end

    example 'convert &lt; into <' do
      lex('<foo class="&lt;" />').should == [
        [:T_ELEM_START, nil, 1],
        [:T_ELEM_NAME, 'foo', 1],
        [:T_ATTR, 'class', 1],
        [:T_STRING, '<', 1],
        [:T_ELEM_END, nil, 1]
      ]
    end

    example 'convert &gt; into >' do
      lex('<foo class="&gt;" />').should == [
        [:T_ELEM_START, nil, 1],
        [:T_ELEM_NAME, 'foo', 1],
        [:T_ATTR, 'class', 1],
        [:T_STRING, '>', 1],
        [:T_ELEM_END, nil, 1]
      ]
    end
  end
end
|
|
@ -15,12 +15,16 @@ describe Oga::XML::Text do
|
|||
end
|
||||
|
||||
context '#to_xml' do
|
||||
before do
|
||||
@instance = described_class.new(:text => 'foo')
|
||||
example 'generate the corresponding XML' do
|
||||
node = described_class.new(:text => 'foo')
|
||||
|
||||
node.to_xml.should == 'foo'
|
||||
end
|
||||
|
||||
example 'generate the corresponding XML' do
|
||||
@instance.to_xml.should == 'foo'
|
||||
# The node's text holds literal special characters; the serialized output
# must contain their entity forms.
example 'encode special characters as XML entities' do
  node = described_class.new(:text => '&<>')

  node.to_xml.should == '&amp;&lt;&gt;'
end
|
||||
end
|
||||
|
||||
|
|
Loading…
Reference in New Issue