Lazy decoding of XML/HTML entities.

Instead of decoding entities in the lexer we'll do this whenever XML::Text#text
is called. This removes the overhead from the parsing phase and ensures the
process is only triggered when actually needed. Note that calling #to_xml and/or
the #inspect methods on a Text (or parent) instance will also trigger the entity
conversion process.

The new entity decoding API supports regular entities (e.g. &amp;) as well
as codepoint based entities (both decimal and hexadecimal codepoints).

To allow safe read-only access to Text instances from multiple threads a mutex
is used. This mutex ensures that only 1 thread can trigger the conversion
process.

Fixes #68
This commit is contained in:
Yorick Peterse 2015-03-05 23:00:43 +01:00
parent 7409257702
commit 2ec91f130f
9 changed files with 2337 additions and 76 deletions

View File

@ -3,6 +3,7 @@ gem 'racc'
require 'ast'
require 'set'
require 'stringio'
require 'thread'
require_relative 'oga/version'
require_relative 'oga/oga'
@ -43,6 +44,7 @@ require_relative 'oga/xml/pull_parser'
require_relative 'oga/html/parser'
require_relative 'oga/html/sax_parser'
require_relative 'oga/html/entities'
require_relative 'oga/xpath/lexer'
require_relative 'oga/xpath/parser'

2150
lib/oga/html/entities.rb Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,9 @@
module Oga
module XML
##
# Module for encoding/decoding XML and HTML entities. The mapping of HTML
# entities can be found in {Oga::HTML::Entities::DECODE_MAPPING}.
#
module Entities
##
# Hash containing XML entities and the corresponding characters.
@ -11,15 +15,10 @@ module Oga
#
DECODE_MAPPING = {
'&lt;' => '<',
'&#60;' => '<',
'&gt;' => '>',
'&#62;' => '>',
'&apos;' => "'",
'&#39;' => "'",
'&quot;' => '"',
'&#34;' => '"',
'&amp;' => '&',
'&#38;' => '&',
}
##
@ -35,16 +34,46 @@ module Oga
'<' => '&lt;',
}
##
# @return [String]
#
AMPERSAND = '&'.freeze
##
# Regexp for matching XML/HTML entities such as "&nbsp;".
#
# @return [Regexp]
#
REGULAR_ENTITY = /&[a-zA-Z]+;/
##
# Regexp for matching codepoint based XML/HTML entities such as "&#38;"
# (decimal) and "&#x26;" (hexadecimal).
#
# Capture 1 is "x" for hexadecimal entities and nil otherwise, capture 2
# holds the digits. The lookbehinds restrict capture 2 to hexadecimal digits
# directly after the "x" and to decimal digits otherwise, so malformed input
# such as "&#foo;" or "&#xZZ;" is not matched (and thus left untouched)
# instead of being handed to Integer(), which would raise ArgumentError.
#
# @return [Regexp]
#
CODEPOINT_ENTITY = /&#(x)?((?<=x)[a-fA-F0-9]+|(?<!x)[0-9]+);/
##
# Regexp matching every character that has an entry in ENCODE_MAPPING.
#
# Regexp.union escapes each key, so a key that happens to be a regexp
# metacharacter is still matched literally.
#
# @return [Regexp]
#
ENCODE_REGEXP = Regexp.union(ENCODE_MAPPING.keys)
##
# Decodes XML entities.
#
# @param [String] input
# @param [Array] keys
# @param [Hash] mapping
# @return [String]
#
def self.decode(input)
if input.include?('&')
DECODE_MAPPING.each do |find, replace|
input = input.gsub(find, replace)
def self.decode(input, mapping = DECODE_MAPPING)
return input unless input.include?(AMPERSAND)
input = input.gsub(REGULAR_ENTITY, mapping)
if input.include?(AMPERSAND)
input = input.gsub(CODEPOINT_ENTITY) do |match|
[$1 ? Integer($2, 16) : Integer($2)].pack('U')
end
end
@ -55,14 +84,11 @@ module Oga
# Encodes special characters as XML entities.
#
# @param [String] input
# @param [Hash] mapping
# @return [String]
#
def self.encode(input)
ENCODE_MAPPING.each do |from, to|
input = input.gsub(from, to) if input.include?(from)
end
return input
def self.encode(input, mapping = ENCODE_MAPPING)
return input.gsub(ENCODE_REGEXP, mapping)
end
end # Entities
end # XML

View File

@ -217,7 +217,7 @@ module Oga
# @param [String] value The data between the quotes.
#
def on_string_body(value)
add_token(:T_STRING_BODY, Entities.decode(value))
add_token(:T_STRING_BODY, value)
end
##
@ -373,7 +373,7 @@ module Oga
def on_text(value)
return if value.empty?
add_token(:T_TEXT, Entities.decode(value))
add_token(:T_TEXT, value)
end
##

View File

@ -5,19 +5,65 @@ module Oga
# have any children, attributes and the likes; just text.
#
class Text < CharacterNode
##
# Sets up the node using the parent class' constructor and prepares the
# state used for lazily decoding entities.
#
# @param [Array] args Arguments passed on unchanged to the parent class.
#
def initialize(*args)
super
# Guards @text and @decoded, see #text and #text=.
@mutex = Mutex.new
# Stays false until #text has decoded the entities of the current text.
@decoded = false
end
##
# Replaces the text of this node. The new value is flagged as not yet
# decoded, so the next call to #text decodes any entities it contains.
#
# @param [String] value
#
def text=(value)
# In case of concurrent text/text= calls.
@mutex.synchronize do
@decoded = false
@text = value
end
end
##
# Returns the text as a String. Upon the first call any XML/HTML entities
# are decoded, using the HTML entity mapping when the node belongs to an HTML
# document and the XML mapping otherwise. The decoded result is cached until
# a new value is assigned using #text=.
#
# @return [String] nil is returned when no text has been set.
#
def text
  @mutex.synchronize do
    unless @decoded
      decoder = html? ? HTML::Entities : Entities

      # A node without text has nothing to decode; passing nil to the decoder
      # would raise NoMethodError.
      @text = decoder.decode(@text) if @text

      @decoded = true
    end

    # Read the value while still holding the lock so a concurrent #text= can
    # not swap in undecoded text between decoding and returning.
    @text
  end
end
##
# @see [Oga::XML::CharacterNode#to_xml]
#
def to_xml
node = parent
root = root_node
if root.is_a?(Document) and node.is_a?(Element) and root.html? \
if node.is_a?(Element) and html? \
and Lexer::LITERAL_HTML_ELEMENTS.include?(node.name)
return super
else
return Entities.encode(super)
end
return Entities.encode(super)
end
private
##
# Tells whether this node lives in an HTML document, i.e. whether its root
# node is a Document that reports itself as HTML.
#
# @return [TrueClass|FalseClass]
#
def html?
  document = root_node

  document.is_a?(Document) && document.html?
end
end # Text
end # XML

View File

@ -0,0 +1,15 @@
# encoding: utf-8
require 'spec_helper'
# Specs for decoding entities through Oga::HTML::Entities.
describe Oga::HTML::Entities do
describe 'decode' do
# &amp; is an entity that is also listed in the XML DECODE_MAPPING.
it 'decodes &amp; into &' do
described_class.decode('&amp;').should == '&'
end
# &lambda; is a named entity that is not listed in the XML DECODE_MAPPING.
it 'decodes &lambda; into λ' do
described_class.decode('&lambda;').should == 'λ'
end
end
end

View File

@ -65,6 +65,10 @@ describe Oga::XML::Entities do
it 'decodes &amp;&amp;lt; into &<' do
described_class.decode('&amp;&amp;lt;').should == '&&lt;'
end
it 'decodes &#x3C; into <' do
described_class.decode('&#x3C;').should == '<'
end
end
describe 'encode' do

View File

@ -1,55 +0,0 @@
require 'spec_helper'
describe Oga::XML::Lexer do
describe 'converting XML entities in text tokens' do
it 'converts &amp; into &' do
lex('&amp;').should == [[:T_TEXT, '&', 1]]
end
it 'converts &lt; into <' do
lex('&lt;').should == [[:T_TEXT, '<', 1]]
end
it 'converts &gt; into >' do
lex('&gt;').should == [[:T_TEXT, '>', 1]]
end
end
describe 'converting XML entities in string tokens' do
it 'converts &amp; into &' do
lex('<foo class="&amp;" />').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'foo', 1],
[:T_ATTR, 'class', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, '&', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
it 'converts &lt; into <' do
lex('<foo class="&lt;" />').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'foo', 1],
[:T_ATTR, 'class', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, '<', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
it 'converts &gt; into >' do
lex('<foo class="&gt;" />').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'foo', 1],
[:T_ATTR, 'class', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, '>', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
end
end

View File

@ -14,6 +14,79 @@ describe Oga::XML::Text do
end
end
describe '#text' do
describe 'with XML entities' do
it 'converts &amp; to &' do
described_class.new(:text => '&amp;').text.should == '&'
end
it 'converts &lt; to <' do
described_class.new(:text => '&lt;').text.should == '<'
end
it 'converts &gt; to >' do
described_class.new(:text => '&gt;').text.should == '>'
end
it 'caches the converted text' do
node = described_class.new(:text => '&amp;')
Oga::XML::Entities.should_receive(:decode).once.and_call_original
node.text.should == '&'
node.text.should == '&'
end
it 'converts new text set using text=' do
node = described_class.new(:text => '&amp;')
node.text.should == '&'
node.text = '&lt;'
node.text.should == '<'
end
end
describe 'with HTML entities' do
before do
@document = Oga::XML::Document.new(:type => :html)
end
it 'converts &amp; to &' do
node = described_class.new(:text => '&amp;')
@document.children << node
node.text.should == '&'
end
it 'converts &lt; to <' do
node = described_class.new(:text => '&lt;')
@document.children << node
node.text.should == '<'
end
it 'converts &gt; to >' do
node = described_class.new(:text => '&gt;')
@document.children << node
node.text.should == '>'
end
it 'converts &nbsp; into a space' do
node = described_class.new(:text => '&nbsp;')
@document.children << node
node.text.should == [160].pack('U')
end
end
end
describe '#to_xml' do
it 'generates the corresponding XML' do
node = described_class.new(:text => 'foo')