Encode/decode XML entities.
When lexing XML entities such as `&amp;` and `&lt;`, these sequences are now
converted into their "actual" forms. In turn, Oga::XML::Text#to_xml ensures they
are encoded when the method is called.
Performance-wise, this puts some strain on the lexer, for every T_TEXT/T_STRING
node now potentially has to have its content modified. In the benchmark
xml/lexer/string_average_bench.rb the average processing time is now about the
same as before the improvements made in 8db77c0a09. I was hoping that the lexer would
still be a bit faster, but alas this is not the case. Doing this in native code
would be a nightmare as C doesn't have a proper string replacement function. I'm
not old/sadistic enough to write one myself just yet.
This fixes #49
This commit is contained in:
parent
3307e2f4d2
commit
5f7256eb8f
|
@ -21,6 +21,7 @@ end
|
||||||
#:nocov:
|
#:nocov:
|
||||||
|
|
||||||
require_relative 'oga/xml/html_void_elements'
|
require_relative 'oga/xml/html_void_elements'
|
||||||
|
require_relative 'oga/xml/entities'
|
||||||
require_relative 'oga/xml/querying'
|
require_relative 'oga/xml/querying'
|
||||||
require_relative 'oga/xml/traversal'
|
require_relative 'oga/xml/traversal'
|
||||||
require_relative 'oga/xml/node'
|
require_relative 'oga/xml/node'
|
||||||
|
|
|
@ -87,7 +87,9 @@ module Oga
|
||||||
full_name = name
|
full_name = name
|
||||||
end
|
end
|
||||||
|
|
||||||
return %Q(#{full_name}="#{value}")
|
enc_value = value ? Entities.encode(value) : nil
|
||||||
|
|
||||||
|
return %Q(#{full_name}="#{enc_value}")
|
||||||
end
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
|
|
|
@ -0,0 +1,56 @@
|
||||||
|
module Oga
  module XML
    ##
    # Conversion between the XML entities `&amp;`, `&lt;` and `&gt;` and the
    # characters they represent.
    #
    module Entities
      ##
      # Hash containing XML entities and the corresponding characters.
      #
      # @return [Hash]
      #
      DECODE_MAPPING = {
        '&amp;' => '&',
        '&lt;'  => '<',
        '&gt;'  => '>'
      }.freeze

      ##
      # Hash containing characters and the corresponding XML entities.
      #
      # @return [Hash]
      #
      ENCODE_MAPPING = DECODE_MAPPING.invert.freeze

      ##
      # Regexp matching any of the entities in {DECODE_MAPPING}.
      #
      # @return [Regexp]
      #
      DECODE_REGEXP = Regexp.union(DECODE_MAPPING.keys)

      ##
      # Regexp matching any of the characters in {ENCODE_MAPPING}.
      #
      # @return [Regexp]
      #
      ENCODE_REGEXP = Regexp.union(ENCODE_MAPPING.keys)

      ##
      # Decodes XML entities.
      #
      # All entities are replaced in a single pass over the input. Replacing
      # them one mapping at a time would decode the output of an earlier
      # replacement again, turning `&amp;lt;` into `<` instead of `&lt;`.
      #
      # @param [String] input
      # @return [String] The decoded String, or `input` itself when it does
      #  not contain an ampersand.
      #
      def self.decode(input)
        # Cheap check so that the common case (no entities) skips the regexp.
        return input unless input.include?('&')

        return input.gsub(DECODE_REGEXP, DECODE_MAPPING)
      end

      ##
      # Encodes special characters as XML entities.
      #
      # All characters are replaced in a single pass, so the ampersands
      # introduced by a replacement are never encoded a second time.
      #
      # @param [String] input
      # @return [String] The encoded String, or `input` itself when there is
      #  nothing to encode.
      #
      def self.encode(input)
        return input unless input =~ ENCODE_REGEXP

        return input.gsub(ENCODE_REGEXP, ENCODE_MAPPING)
      end
    end # Entities
  end # XML
end # Oga
|
|
@ -194,7 +194,7 @@ module Oga
|
||||||
# @param [String] value The data between the quotes.
|
# @param [String] value The data between the quotes.
|
||||||
#
|
#
|
||||||
def on_string(value)
|
def on_string(value)
|
||||||
add_token(:T_STRING, value)
|
add_token(:T_STRING, Entities.decode(value))
|
||||||
end
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
|
@ -348,7 +348,9 @@ module Oga
|
||||||
# @param [String] value
|
# @param [String] value
|
||||||
#
|
#
|
||||||
def on_text(value)
|
def on_text(value)
|
||||||
add_token(:T_TEXT, value) unless value.empty?
|
return if value.empty?
|
||||||
|
|
||||||
|
add_token(:T_TEXT, Entities.decode(value))
|
||||||
end
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
|
|
|
@ -5,7 +5,12 @@ module Oga
|
||||||
# have any children, attributes and the likes; just text.
|
# have any children, attributes and the likes; just text.
|
||||||
#
|
#
|
||||||
class Text < CharacterNode
|
class Text < CharacterNode
|
||||||
|
##
|
||||||
|
# @see [Oga::XML::CharacterNode#to_xml]
|
||||||
|
#
|
||||||
|
def to_xml
|
||||||
|
return Entities.encode(super)
|
||||||
|
end
|
||||||
end # Text
|
end # Text
|
||||||
end # XML
|
end # XML
|
||||||
end # Oga
|
end # Oga
|
||||||
|
|
|
@ -78,6 +78,12 @@ describe Oga::XML::Attribute do
|
||||||
|
|
||||||
attr.to_xml.should == 'xmlns:class=""'
|
attr.to_xml.should == 'xmlns:class=""'
|
||||||
end
|
end
|
||||||
|
|
||||||
|
example 'convert special characters to XML entities' do
|
||||||
|
attr = described_class.new(:name => 'href', :value => '&<>')
|
||||||
|
|
||||||
|
attr.to_xml.should == 'href="&<>"'
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
context '#inspect' do
|
context '#inspect' do
|
||||||
|
|
|
@ -0,0 +1,31 @@
|
||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
# Specs for the conversion between XML entities and their literal characters.
describe Oga::XML::Entities do
  context 'decode' do
    example 'decode &amp; into &' do
      described_class.decode('&amp;').should == '&'
    end

    example 'decode &lt; into <' do
      described_class.decode('&lt;').should == '<'
    end

    example 'decode &gt; into >' do
      described_class.decode('&gt;').should == '>'
    end
  end

  context 'encode' do
    example 'encode & as &amp;' do
      described_class.encode('&').should == '&amp;'
    end

    example 'encode < as &lt;' do
      described_class.encode('<').should == '&lt;'
    end

    example 'encode > as &gt;' do
      described_class.encode('>').should == '&gt;'
    end
  end
end
|
|
@ -0,0 +1,49 @@
|
||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
# Specs ensuring the lexer emits decoded characters for XML entities found in
# text nodes and attribute values.
describe Oga::XML::Lexer do
  context 'converting XML entities in text tokens' do
    example 'convert &amp; into &' do
      lex('&amp;').should == [[:T_TEXT, '&', 1]]
    end

    example 'convert &lt; into <' do
      lex('&lt;').should == [[:T_TEXT, '<', 1]]
    end

    example 'convert &gt; into >' do
      lex('&gt;').should == [[:T_TEXT, '>', 1]]
    end
  end

  context 'converting XML entities in string tokens' do
    example 'convert &amp; into &' do
      lex('<foo class="&amp;" />').should == [
        [:T_ELEM_START, nil, 1],
        [:T_ELEM_NAME, 'foo', 1],
        [:T_ATTR, 'class', 1],
        [:T_STRING, '&', 1],
        [:T_ELEM_END, nil, 1]
      ]
    end

    example 'convert &lt; into <' do
      lex('<foo class="&lt;" />').should == [
        [:T_ELEM_START, nil, 1],
        [:T_ELEM_NAME, 'foo', 1],
        [:T_ATTR, 'class', 1],
        [:T_STRING, '<', 1],
        [:T_ELEM_END, nil, 1]
      ]
    end

    example 'convert &gt; into >' do
      lex('<foo class="&gt;" />').should == [
        [:T_ELEM_START, nil, 1],
        [:T_ELEM_NAME, 'foo', 1],
        [:T_ATTR, 'class', 1],
        [:T_STRING, '>', 1],
        [:T_ELEM_END, nil, 1]
      ]
    end
  end
end
|
|
@ -15,12 +15,16 @@ describe Oga::XML::Text do
|
||||||
end
|
end
|
||||||
|
|
||||||
context '#to_xml' do
|
context '#to_xml' do
|
||||||
before do
|
example 'generate the corresponding XML' do
|
||||||
@instance = described_class.new(:text => 'foo')
|
node = described_class.new(:text => 'foo')
|
||||||
|
|
||||||
|
node.to_xml.should == 'foo'
|
||||||
end
|
end
|
||||||
|
|
||||||
example 'generate the corresponding XML' do
|
example 'encode special characters as XML entities' do
|
||||||
@instance.to_xml.should == 'foo'
|
node = described_class.new(:text => '&<>')
|
||||||
|
|
||||||
|
node.to_xml.should == '&<>'
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue