Lazy decoding of XML/HTML entities.
Instead of decoding entities in the lexer, we now decode them whenever XML::Text#text is called. This removes the overhead from the parsing phase and ensures the process is only triggered when actually needed. Note that calling #to_xml and/or #inspect on a Text (or parent) instance will also trigger the entity conversion process. The new entity decoding API supports both regular entities (e.g. &amp;amp;) and codepoint-based entities (both decimal and hexadecimal codepoints, e.g. &amp;#38; and &amp;#x26;). To allow safe read-only access to Text instances from multiple threads, a mutex is used. This mutex ensures that only one thread at a time can trigger the conversion process. Fixes #68
This commit is contained in:
parent
7409257702
commit
2ec91f130f
|
@ -3,6 +3,7 @@ gem 'racc'
|
|||
require 'ast'
|
||||
require 'set'
|
||||
require 'stringio'
|
||||
require 'thread'
|
||||
|
||||
require_relative 'oga/version'
|
||||
require_relative 'oga/oga'
|
||||
|
@ -43,6 +44,7 @@ require_relative 'oga/xml/pull_parser'
|
|||
|
||||
require_relative 'oga/html/parser'
|
||||
require_relative 'oga/html/sax_parser'
|
||||
require_relative 'oga/html/entities'
|
||||
|
||||
require_relative 'oga/xpath/lexer'
|
||||
require_relative 'oga/xpath/parser'
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,5 +1,9 @@
|
|||
module Oga
|
||||
module XML
|
||||
##
|
||||
# Module for encoding/decoding XML and HTML entities. The mapping of HTML
|
||||
# entities can be found in {Oga::HTML::Entities::DECODE_MAPPING}.
|
||||
#
|
||||
module Entities
|
||||
##
|
||||
# Hash containing XML entities and the corresponding characters.
|
||||
|
@ -11,15 +15,10 @@ module Oga
|
|||
#
|
||||
DECODE_MAPPING = {
|
||||
'<' => '<',
|
||||
'<' => '<',
|
||||
'>' => '>',
|
||||
'>' => '>',
|
||||
''' => "'",
|
||||
''' => "'",
|
||||
'"' => '"',
|
||||
'"' => '"',
|
||||
'&' => '&',
|
||||
'&' => '&',
|
||||
}
|
||||
|
||||
##
|
||||
|
@ -35,16 +34,46 @@ module Oga
|
|||
'<' => '<',
|
||||
}
|
||||
|
||||
##
|
||||
# @return [String]
|
||||
#
|
||||
AMPERSAND = '&'.freeze
|
||||
|
||||
##
|
||||
# Regexp for matching named XML/HTML entities such as "&amp;nbsp;".
|
||||
#
|
||||
# @return [Regexp]
|
||||
#
|
||||
REGULAR_ENTITY = /&[a-zA-Z]+;/
|
||||
|
||||
##
|
||||
# Regexp for matching codepoint-based XML/HTML entities such as "&amp;#38;" (decimal) or "&amp;#x26;" (hexadecimal).
|
||||
#
|
||||
# @return [Regexp]
|
||||
#
|
||||
CODEPOINT_ENTITY = /&#(x)?([a-zA-Z0-9]+);/
|
||||
|
||||
##
|
||||
# @return [Regexp]
|
||||
#
|
||||
ENCODE_REGEXP = Regexp.new(ENCODE_MAPPING.keys.join('|'))
|
||||
|
||||
##
|
||||
# Decodes XML entities.
|
||||
#
|
||||
# @param [String] input
|
||||
# @param [Array] keys
|
||||
# @param [Hash] mapping
|
||||
# @return [String]
|
||||
#
|
||||
def self.decode(input)
|
||||
if input.include?('&')
|
||||
DECODE_MAPPING.each do |find, replace|
|
||||
input = input.gsub(find, replace)
|
||||
def self.decode(input, mapping = DECODE_MAPPING)
|
||||
return input unless input.include?(AMPERSAND)
|
||||
|
||||
input = input.gsub(REGULAR_ENTITY, mapping)
|
||||
|
||||
if input.include?(AMPERSAND)
|
||||
input = input.gsub(CODEPOINT_ENTITY) do |match|
|
||||
[$1 ? Integer($2, 16) : Integer($2)].pack('U')
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -55,14 +84,11 @@ module Oga
|
|||
# Encodes special characters as XML entities.
|
||||
#
|
||||
# @param [String] input
|
||||
# @param [Hash] mapping
|
||||
# @return [String]
|
||||
#
|
||||
def self.encode(input)
|
||||
ENCODE_MAPPING.each do |from, to|
|
||||
input = input.gsub(from, to) if input.include?(from)
|
||||
end
|
||||
|
||||
return input
|
||||
def self.encode(input, mapping = ENCODE_MAPPING)
|
||||
return input.gsub(ENCODE_REGEXP, mapping)
|
||||
end
|
||||
end # Entities
|
||||
end # XML
|
||||
|
|
|
@ -217,7 +217,7 @@ module Oga
|
|||
# @param [String] value The data between the quotes.
|
||||
#
|
||||
def on_string_body(value)
|
||||
add_token(:T_STRING_BODY, Entities.decode(value))
|
||||
add_token(:T_STRING_BODY, value)
|
||||
end
|
||||
|
||||
##
|
||||
|
@ -373,7 +373,7 @@ module Oga
|
|||
def on_text(value)
|
||||
return if value.empty?
|
||||
|
||||
add_token(:T_TEXT, Entities.decode(value))
|
||||
add_token(:T_TEXT, value)
|
||||
end
|
||||
|
||||
##
|
||||
|
|
|
@ -5,19 +5,65 @@ module Oga
|
|||
# have any children, attributes and the likes; just text.
|
||||
#
|
||||
class Text < CharacterNode
|
||||
##
# Builds the node through the parent constructor and then sets up the
# state used for lazy entity decoding.
#
# @param [Array] args Arguments forwarded unchanged to the superclass.
#
def initialize(*args)
  super

  # Nothing has been decoded yet; #text flips this after its first call.
  @decoded = false

  # Lock taken by #text and #text= around changes to @text/@decoded.
  @mutex = Mutex.new
end
|
||||
|
||||
##
|
||||
# @param [String] value
|
||||
#
|
||||
def text=(value)
  # Take the lock so a concurrent #text call never observes the new string
  # together with a stale "already decoded" flag.
  @mutex.synchronize do
    # Reset the flag first: the new value has not been decoded yet.
    @decoded = false
    @text    = value
  end
end
|
||||
|
||||
##
|
||||
# Returns the text as a String. Upon the first call any XML/HTML entities
|
||||
# are decoded.
|
||||
#
|
||||
# @return [String]
|
||||
#
|
||||
def text
  # Decode and read under one lock acquisition. Reading @text after the lock
  # has been released would let a concurrent #text= swap in a new,
  # still-encoded string that would then be returned undecoded.
  @mutex.synchronize do
    unless @decoded
      # HTML documents use the HTML entity decoder; everything else uses the
      # XML one.
      decoder  = html? ? HTML::Entities : Entities
      @text    = decoder.decode(@text)
      @decoded = true
    end

    # Mutex#synchronize returns the block's value, so this is what the
    # caller receives.
    @text
  end
end
|
||||
|
||||
##
|
||||
# @see [Oga::XML::CharacterNode#to_xml]
|
||||
#
|
||||
def to_xml
|
||||
node = parent
|
||||
root = root_node
|
||||
|
||||
if root.is_a?(Document) and node.is_a?(Element) and root.html? \
|
||||
if node.is_a?(Element) and html? \
|
||||
and Lexer::LITERAL_HTML_ELEMENTS.include?(node.name)
|
||||
return super
|
||||
else
|
||||
return Entities.encode(super)
|
||||
end
|
||||
|
||||
return Entities.encode(super)
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
##
|
||||
# @return [TrueClass|FalseClass]
|
||||
#
|
||||
def html?
  document = root_node

  # Only a Document can answer the HTML question; any other root (or no
  # root at all) is treated as non-HTML.
  document.is_a?(Document) && document.html?
end
|
||||
end # Text
|
||||
end # XML
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
# encoding: utf-8
|
||||
|
||||
require 'spec_helper'
|
||||
|
||||
describe Oga::HTML::Entities do
|
||||
describe 'decode' do
|
||||
it 'decodes & into &' do
|
||||
described_class.decode('&').should == '&'
|
||||
end
|
||||
|
||||
it 'decodes λ into λ' do
|
||||
described_class.decode('λ').should == 'λ'
|
||||
end
|
||||
end
|
||||
end
|
|
@ -65,6 +65,10 @@ describe Oga::XML::Entities do
|
|||
it 'decodes &&lt; into &<' do
|
||||
described_class.decode('&&lt;').should == '&<'
|
||||
end
|
||||
|
||||
it 'decodes < into <' do
|
||||
described_class.decode('<').should == '<'
|
||||
end
|
||||
end
|
||||
|
||||
describe 'encode' do
|
||||
|
|
|
@ -1,55 +0,0 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::XML::Lexer do
|
||||
describe 'converting XML entities in text tokens' do
|
||||
it 'converts & into &' do
|
||||
lex('&').should == [[:T_TEXT, '&', 1]]
|
||||
end
|
||||
|
||||
it 'converts < into <' do
|
||||
lex('<').should == [[:T_TEXT, '<', 1]]
|
||||
end
|
||||
|
||||
it 'converts > into >' do
|
||||
lex('>').should == [[:T_TEXT, '>', 1]]
|
||||
end
|
||||
end
|
||||
|
||||
describe 'converting XML entities in string tokens' do
|
||||
it 'converts & into &' do
|
||||
lex('<foo class="&" />').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'foo', 1],
|
||||
[:T_ATTR, 'class', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, '&', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
it 'converts < into <' do
|
||||
lex('<foo class="<" />').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'foo', 1],
|
||||
[:T_ATTR, 'class', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, '<', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
it 'converts > into >' do
|
||||
lex('<foo class=">" />').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'foo', 1],
|
||||
[:T_ATTR, 'class', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, '>', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
end
|
||||
end
|
|
@ -14,6 +14,79 @@ describe Oga::XML::Text do
|
|||
end
|
||||
end
|
||||
|
||||
describe '#text' do
|
||||
describe 'with XML entities' do
|
||||
it 'converts & to &' do
|
||||
described_class.new(:text => '&').text.should == '&'
|
||||
end
|
||||
|
||||
it 'converts < to <' do
|
||||
described_class.new(:text => '<').text.should == '<'
|
||||
end
|
||||
|
||||
it 'converts > to >' do
|
||||
described_class.new(:text => '>').text.should == '>'
|
||||
end
|
||||
|
||||
it 'caches the converted text' do
|
||||
node = described_class.new(:text => '&')
|
||||
|
||||
Oga::XML::Entities.should_receive(:decode).once.and_call_original
|
||||
|
||||
node.text.should == '&'
|
||||
node.text.should == '&'
|
||||
end
|
||||
|
||||
it 'converts new text set using text=' do
|
||||
node = described_class.new(:text => '&')
|
||||
|
||||
node.text.should == '&'
|
||||
|
||||
node.text = '<'
|
||||
|
||||
node.text.should == '<'
|
||||
end
|
||||
end
|
||||
|
||||
describe 'with HTML entities' do
|
||||
before do
|
||||
@document = Oga::XML::Document.new(:type => :html)
|
||||
end
|
||||
|
||||
it 'converts & to &' do
|
||||
node = described_class.new(:text => '&')
|
||||
|
||||
@document.children << node
|
||||
|
||||
node.text.should == '&'
|
||||
end
|
||||
|
||||
it 'converts < to <' do
|
||||
node = described_class.new(:text => '<')
|
||||
|
||||
@document.children << node
|
||||
|
||||
node.text.should == '<'
|
||||
end
|
||||
|
||||
it 'converts > to >' do
|
||||
node = described_class.new(:text => '>')
|
||||
|
||||
@document.children << node
|
||||
|
||||
node.text.should == '>'
|
||||
end
|
||||
|
||||
it 'converts into a space' do
|
||||
node = described_class.new(:text => ' ')
|
||||
|
||||
@document.children << node
|
||||
|
||||
node.text.should == [160].pack('U')
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe '#to_xml' do
|
||||
it 'generates the corresponding XML' do
|
||||
node = described_class.new(:text => 'foo')
|
||||
|
|
Loading…
Reference in New Issue