Encode/decode XML entities.

When lexing XML entities such as `&amp;` and `&lt;`, these sequences are now
converted into their "actual" forms (`&` and `<`). In turn,
Oga::XML::Text#to_xml ensures they are encoded again when the method is called.

Performance-wise this puts some strain on the lexer, for every T_TEXT/T_STRING
node now potentially has to have its content modified. In the benchmark
xml/lexer/string_average_bench.rb the average processing time is now about the
same as before the improvements made in
8db77c0a09. I was hoping that the lexer would
still be a bit faster, but alas this is not the case. Doing this in native code
would be a nightmare as C doesn't have a proper string replacement function. I'm
not old/sadistic enough to write one myself just yet.

This fixes #49
This commit is contained in:
Yorick Peterse 2014-09-28 21:35:00 +02:00
parent 3307e2f4d2
commit 5f7256eb8f
9 changed files with 164 additions and 8 deletions

View File

@ -21,6 +21,7 @@ end
#:nocov:
require_relative 'oga/xml/html_void_elements'
require_relative 'oga/xml/entities'
require_relative 'oga/xml/querying'
require_relative 'oga/xml/traversal'
require_relative 'oga/xml/node'

View File

@ -87,7 +87,9 @@ module Oga
full_name = name
end
return %Q(#{full_name}="#{value}")
enc_value = value ? Entities.encode(value) : nil
return %Q(#{full_name}="#{enc_value}")
end
##

56
lib/oga/xml/entities.rb Normal file
View File

@ -0,0 +1,56 @@
module Oga
  module XML
    ##
    # Conversion between XML entities (`&amp;`, `&lt;`, `&gt;`) and the
    # characters they represent.
    #
    module Entities
      ##
      # Hash containing XML entities and the corresponding characters.
      #
      # The `&amp;` mapping is listed first so that {ENCODE_MAPPING}, which is
      # built by inverting this Hash, lists `&` first as well. Both {decode}
      # and {encode} replace in a single pass, so correctness does not depend
      # on this order.
      #
      # @return [Hash]
      #
      DECODE_MAPPING = {
        '&amp;' => '&',
        '&lt;'  => '<',
        '&gt;'  => '>'
      }.freeze

      ##
      # Hash containing characters and the corresponding XML entities.
      #
      # @return [Hash]
      #
      ENCODE_MAPPING = DECODE_MAPPING.invert.freeze

      ##
      # Regexp matching any of the entities in {DECODE_MAPPING}.
      #
      # @return [Regexp]
      #
      DECODE_REGEXP = Regexp.union(DECODE_MAPPING.keys)

      ##
      # Regexp matching any of the characters in {ENCODE_MAPPING}.
      #
      # @return [Regexp]
      #
      ENCODE_REGEXP = Regexp.union(ENCODE_MAPPING.keys)

      ##
      # Decodes XML entities.
      #
      # All entities are replaced in one pass over the input, so text produced
      # by one replacement is never decoded again: `&amp;lt;` becomes `&lt;`
      # and not `<`.
      #
      # @param [String] input
      # @return [String]
      #
      def self.decode(input)
        # Cheap guard: most text contains no entities at all.
        return input unless input.include?('&')

        return input.gsub(DECODE_REGEXP, DECODE_MAPPING)
      end

      ##
      # Encodes special characters as XML entities.
      #
      # All characters are replaced in one pass over the input, so the `&` of
      # an entity that was just inserted is never encoded a second time.
      #
      # @param [String] input
      # @return [String]
      #
      def self.encode(input)
        return input.gsub(ENCODE_REGEXP, ENCODE_MAPPING)
      end
    end # Entities
  end # XML
end # Oga

View File

@ -194,7 +194,7 @@ module Oga
# @param [String] value The data between the quotes.
#
def on_string(value)
add_token(:T_STRING, value)
add_token(:T_STRING, Entities.decode(value))
end
##
@ -348,7 +348,9 @@ module Oga
# @param [String] value
#
def on_text(value)
add_token(:T_TEXT, value) unless value.empty?
return if value.empty?
add_token(:T_TEXT, Entities.decode(value))
end
##

View File

@ -5,7 +5,12 @@ module Oga
# have any children, attributes and the likes; just text.
#
class Text < CharacterNode
  ##
  # Serializes the node to XML by passing the output of the parent
  # implementation through {Oga::XML::Entities.encode}.
  #
  # @see Oga::XML::CharacterNode#to_xml
  # @return [String]
  #
  def to_xml
    raw = super

    Entities.encode(raw)
  end
end # Text
end # XML
end # Oga

View File

@ -78,6 +78,12 @@ describe Oga::XML::Attribute do
attr.to_xml.should == 'xmlns:class=""'
end
example 'convert special characters to XML entities' do
attr = described_class.new(:name => 'href', :value => '&<>')
attr.to_xml.should == 'href="&amp;&lt;&gt;"'
end
end
context '#inspect' do

View File

@ -0,0 +1,31 @@
require 'spec_helper'
describe Oga::XML::Entities do
  # Entity => literal character pairs, exercised in both directions.
  pairs = {
    '&amp;' => '&',
    '&lt;'  => '<',
    '&gt;'  => '>'
  }

  context 'decode' do
    pairs.each do |entity, char|
      example "decode #{entity} into #{char}" do
        described_class.decode(entity).should == char
      end
    end
  end

  context 'encode' do
    pairs.each do |entity, char|
      example "encode #{char} as #{entity}" do
        described_class.encode(char).should == entity
      end
    end
  end
end

View File

@ -0,0 +1,49 @@
require 'spec_helper'
describe Oga::XML::Lexer do
  # Entity => character the lexer is expected to emit in its place.
  entities = {
    '&amp;' => '&',
    '&lt;'  => '<',
    '&gt;'  => '>'
  }

  context 'converting XML entities in text tokens' do
    entities.each do |entity, char|
      example "convert #{entity} into #{char}" do
        lex(entity).should == [[:T_TEXT, char, 1]]
      end
    end
  end

  context 'converting XML entities in string tokens' do
    entities.each do |entity, char|
      example "convert #{entity} into #{char}" do
        expected = [
          [:T_ELEM_START, nil, 1],
          [:T_ELEM_NAME, 'foo', 1],
          [:T_ATTR, 'class', 1],
          [:T_STRING, char, 1],
          [:T_ELEM_END, nil, 1]
        ]

        lex(%Q(<foo class="#{entity}" />)).should == expected
      end
    end
  end
end

View File

@ -15,12 +15,16 @@ describe Oga::XML::Text do
end
context '#to_xml' do
before do
@instance = described_class.new(:text => 'foo')
example 'generate the corresponding XML' do
node = described_class.new(:text => 'foo')
node.to_xml.should == 'foo'
end
example 'generate the corresponding XML' do
@instance.to_xml.should == 'foo'
example 'encode special characters as XML entities' do
node = described_class.new(:text => '&<>')
node.to_xml.should == '&amp;&lt;&gt;'
end
end