Generate XML without relying on recursion

While using recursion is an easy way of generating XML, it can lead to
the call stack overflowing when serialising documents with lots of
nested nodes.

Generally there are two ways of working around this:

1. Use an explicit stack (e.g. an array or a queue of sorts) instead of
   relying on the call stack.
2. Use an algorithm that doesn't use a stack at all (e.g. Morris
   traversal).

This commit introduces the XML::Generator class which can serialize
documents back to XML without using a stack at all. This class takes
advantage of XML nodes having access to not only their child nodes, but
also their siblings and their parents.

All XML serialisation logic now resides in the XML::Generator class. In
turn the various "to_xml" methods just use this class and serialize
everything starting at "self".
This commit is contained in:
Yorick Peterse 2016-09-03 21:24:19 +02:00
parent 9ac16e2e4f
commit dd138981f6
No known key found for this signature in database
GPG Key ID: EDD30D2BEB691AC9
19 changed files with 311 additions and 122 deletions

View File

@ -3,6 +3,17 @@
This document contains details of the various releases and their release dates. This document contains details of the various releases and their release dates.
Dates are in the format `yyyy-mm-dd`. Dates are in the format `yyyy-mm-dd`.
## 2.4 - Unreleased
### Serialising Large Documents
Oga can now serialise large documents without causing the call stack to overflow
thanks to the new `Oga::XML::Generator` class. This class can generate XML
without using a stack at all.
See issue <https://github.com/YorickPeterse/oga/issues/158> for more
information.
## 2.3 - 2016-07-13 ## 2.3 - 2016-07-13
Thanks to various changes provided by Erik Michaels-Ober Oga can now be used to Thanks to various changes provided by Erik Michaels-Ober Oga can now be used to

View File

@ -23,6 +23,7 @@ if RUBY_PLATFORM == 'java'
end end
#:nocov: #:nocov:
require 'oga/xml/to_xml'
require 'oga/xml/html_void_elements' require 'oga/xml/html_void_elements'
require 'oga/xml/entities' require 'oga/xml/entities'
require 'oga/xml/querying' require 'oga/xml/querying'
@ -42,6 +43,7 @@ require 'oga/xml/default_namespace'
require 'oga/xml/attribute' require 'oga/xml/attribute'
require 'oga/xml/element' require 'oga/xml/element'
require 'oga/xml/node_set' require 'oga/xml/node_set'
require 'oga/xml/generator'
require 'oga/xml/sax_parser' require 'oga/xml/sax_parser'
require 'oga/xml/pull_parser' require 'oga/xml/pull_parser'

View File

@ -3,6 +3,7 @@ module Oga
# Class for storing information about a single XML attribute. # Class for storing information about a single XML attribute.
class Attribute class Attribute
include ExpandedName include ExpandedName
include ToXML
# The name of the attribute. # The name of the attribute.
# @return [String] # @return [String]
@ -81,19 +82,6 @@ module Oga
alias_method :to_s, :text alias_method :to_s, :text
# @return [String]
def to_xml
if namespace_name
full_name = "#{namespace_name}:#{name}"
else
full_name = name
end
enc_value = value ? Entities.encode_attribute(value) : nil
%Q(#{full_name}="#{enc_value}")
end
# @return [String] # @return [String]
def inspect def inspect
segments = [] segments = []

View File

@ -2,12 +2,6 @@ module Oga
module XML module XML
# Class used for storing information about CDATA tags. # Class used for storing information about CDATA tags.
class Cdata < CharacterNode class Cdata < CharacterNode
# Converts the node back to XML.
#
# @return [String]
def to_xml
"<![CDATA[#{text}]]>"
end
end # Cdata end # Cdata
end # XML end # XML
end # Oga end # Oga

View File

@ -15,11 +15,6 @@ module Oga
@text = options[:text] @text = options[:text]
end end
# @return [String]
def to_xml
text.to_s
end
# @return [String] # @return [String]
def inspect def inspect
"#{self.class.to_s.split('::').last}(#{text.inspect})" "#{self.class.to_s.split('::').last}(#{text.inspect})"

View File

@ -2,12 +2,6 @@ module Oga
module XML module XML
# Class used for storing information about XML comments. # Class used for storing information about XML comments.
class Comment < CharacterNode class Comment < CharacterNode
# Converts the node back to XML.
#
# @return [String]
def to_xml
"<!--#{text}-->"
end
end # Comment end # Comment
end # XML end # XML
end # Oga end # Oga

View File

@ -2,6 +2,8 @@ module Oga
module XML module XML
# Class used for storing information about Doctypes. # Class used for storing information about Doctypes.
class Doctype class Doctype
include ToXML
# The name of the doctype (e.g. "HTML"). # The name of the doctype (e.g. "HTML").
# @return [String] # @return [String]
attr_accessor :name attr_accessor :name
@ -39,20 +41,6 @@ module Oga
@inline_rules = options[:inline_rules] @inline_rules = options[:inline_rules]
end end
# Converts the doctype back to XML.
#
# @return [String]
def to_xml
segments = "<!DOCTYPE #{name}"
segments << " #{type}" if type
segments << %Q{ "#{public_id}"} if public_id
segments << %Q{ "#{system_id}"} if system_id
segments << " [#{inline_rules}]" if inline_rules
segments + '>'
end
# Inspects the doctype. # Inspects the doctype.
# #
# @return [String] # @return [String]

View File

@ -5,6 +5,7 @@ module Oga
class Document class Document
include Querying include Querying
include Traversal include Traversal
include ToXML
# @return [Oga::XML::Doctype] # @return [Oga::XML::Doctype]
attr_accessor :doctype attr_accessor :doctype
@ -56,23 +57,6 @@ module Oga
self self
end end
# Converts the document and its child nodes to XML.
#
# @return [String]
def to_xml
xml = children.map(&:to_xml).join('')
if doctype
xml = doctype.to_xml + "\n" + xml.strip
end
if xml_declaration
xml = xml_declaration.to_xml + "\n" + xml.strip
end
xml
end
# @return [TrueClass|FalseClass] # @return [TrueClass|FalseClass]
def html? def html?
type.equal?(:html) type.equal?(:html)
@ -99,6 +83,11 @@ Document(
) )
EOF EOF
end end
# Always returns false: a document itself is never a literal HTML element.
#
# Having this method here lets code that asks a node's parent whether it is
# a literal HTML element (as the XML generator does for text nodes) call
# the method without first checking what kind of object the parent is.
#
# @return [FalseClass]
def literal_html_name?
false
end
end # Document end # Document
end # XML end # XML
end # Oga end # Oga

View File

@ -211,30 +211,6 @@ module Oga
@children = NodeSet.new([text_node], self) @children = NodeSet.new([text_node], self)
end end
# Converts the element and its child elements to XML.
#
# @return [String]
def to_xml
if namespace_name
full_name = "#{namespace_name}:#{name}"
else
full_name = name
end
body = children.map(&:to_xml).join('')
attrs = ''
attributes.each do |attr|
attrs << " #{attr.to_xml}"
end
if self_closing?
return "<#{full_name}#{attrs} />"
else
return "<#{full_name}#{attrs}>#{body}</#{full_name}>"
end
end
# @return [String] # @return [String]
def inspect def inspect
segments = [] segments = []
@ -323,6 +299,14 @@ module Oga
end end
end end
# Returns true if the current element name is the name of one of the
# literal HTML elements.
#
# Only the element name is looked up (in Lexer::LITERAL_HTML_ELEMENTS);
# this method does not check whether the element is part of an HTML
# document, so callers that care must check that separately.
#
# @return [TrueClass|FalseClass]
def literal_html_name?
Lexer::LITERAL_HTML_ELEMENTS.allow?(name)
end
private private
# Registers namespaces based on any "xmlns" attributes. # Registers namespaces based on any "xmlns" attributes.

198
lib/oga/xml/generator.rb Normal file
View File

@ -0,0 +1,198 @@
module Oga
  module XML
    # Class for generating XML as a String based on an existing document.
    #
    # The tree is walked iteratively using each node's children, next sibling
    # and parent, so serialising deeply nested documents does not grow the
    # Ruby call stack.
    #
    # Basic usage:
    #
    #     element = Oga::XML::Element.new(name: 'root')
    #     element.inner_text = 'hello'
    #
    #     gen = Oga::XML::Generator.new(element)
    #
    #     gen.to_xml # => "<root>hello</root>"
    #
    # @private
    class Generator
      # @param [Oga::XML::Document|Oga::XML::Node] root The node to serialise.
      def initialize(root)
        @start = root

        if @start.respond_to?(:root_node)
          @html_mode = @start.root_node.html?
        elsif @start.respond_to?(:html?)
          # Fallback for start objects that expose #html? directly but have
          # no #root_node (presumably Oga::XML::Document; confirm against the
          # Document class).
          @html_mode = @start.html?
        else
          @html_mode = false
        end
      end

      # Returns the XML for the current root node.
      #
      # Only the start node and its descendants are serialised; siblings of
      # the start node are never visited.
      #
      # @return [String]
      # @raise [TypeError] When a node of an unsupported type is encountered.
      def to_xml
        current = @start
        output = ''

        while current
          children = false

          # Determine what callback to use for the current node. The order of
          # this statement is based on how likely it is for an arm to match.
          case current
          when Oga::XML::Element
            callback = :on_element
            children = true
          when Oga::XML::Text
            callback = :on_text
          when Oga::XML::Cdata
            callback = :on_cdata
          when Oga::XML::Comment
            callback = :on_comment
          when Oga::XML::Attribute
            callback = :on_attribute
          when Oga::XML::ProcessingInstruction
            callback = :on_processing_instruction
          when Oga::XML::Doctype
            callback = :on_doctype
          when Oga::XML::XmlDeclaration
            callback = :on_xml_declaration
          when Oga::XML::Document
            callback = :on_document
            children = true
          else
            raise TypeError, "Can't serialize #{current.class} to XML"
          end

          send(callback, current, output)

          if child_node = children && current.children[0]
            current = child_node
          elsif current == @start
            # The start node has no children. Stop here instead of moving on
            # to its siblings, which are not part of the requested output.
            after_element(current, output) if current.is_a?(Element)

            break
          else
            # Always close the current (childless) element before moving to a
            # sibling or back up to the parent; this is a no-op for
            # self-closing elements.
            after_element(current, output) if current.is_a?(Element)

            # Walk up until a node with a next sibling is found, closing every
            # element that is left on the way. Reaching the start node ends
            # the traversal (next_node is falsy at that point).
            until next_node = current.is_a?(Node) && current.next
              if current.is_a?(Node) && current != @start
                current = current.parent
              end

              after_element(current, output) if current.is_a?(Element)

              break if current == @start
            end

            current = next_node
          end
        end

        output
      end

      # Appends a text node. Text is entity-encoded unless the generator is in
      # HTML mode and the parent reports a literal HTML element name.
      #
      # @param [Oga::XML::Text] node
      # @param [String] output
      def on_text(node, output)
        if @html_mode && (parent = node.parent) && parent.literal_html_name?
          output << node.text
        else
          output << Entities.encode(node.text)
        end
      end

      # @param [Oga::XML::Cdata] node
      # @param [String] output
      def on_cdata(node, output)
        output << "<![CDATA[#{node.text}]]>"
      end

      # @param [Oga::XML::Comment] node
      # @param [String] output
      def on_comment(node, output)
        output << "<!--#{node.text}-->"
      end

      # @param [Oga::XML::ProcessingInstruction] node
      # @param [String] output
      def on_processing_instruction(node, output)
        output << "<?#{node.name}#{node.text}?>"
      end

      # Appends the opening tag (including attributes) of an element, or the
      # complete tag when the element is self-closing. Child nodes and the
      # closing tag are written separately by the traversal in #to_xml.
      #
      # @param [Oga::XML::Element] element
      # @param [String] output
      def on_element(element, output)
        name = element.expanded_name
        attrs = ''

        element.attributes.each do |attr|
          attrs << ' '
          on_attribute(attr, attrs)
        end

        if self_closing?(element)
          output << "<#{name}#{attrs} />"
        else
          output << "<#{name}#{attrs}>"
        end
      end

      # Appends the closing tag of an element, unless it was written as a
      # self-closing tag.
      #
      # @param [Oga::XML::Element] element
      # @param [String] output
      def after_element(element, output)
        output << "</#{element.expanded_name}>" unless self_closing?(element)
      end

      # @param [Oga::XML::Attribute] attr
      # @param [String] output
      def on_attribute(attr, output)
        name = attr.expanded_name
        enc_value = attr.value ? Entities.encode_attribute(attr.value) : nil

        output << %Q(#{name}="#{enc_value}")
      end

      # @param [Oga::XML::Doctype] node
      # @param [String] output
      def on_doctype(node, output)
        output << "<!DOCTYPE #{node.name}"

        output << " #{node.type}" if node.type
        output << %Q{ "#{node.public_id}"} if node.public_id
        output << %Q{ "#{node.system_id}"} if node.system_id
        output << " [#{node.inline_rules}]" if node.inline_rules

        output << '>'
      end

      # Appends the XML declaration and doctype of a document (each followed
      # by a newline) when present. The document's child nodes are written by
      # the traversal in #to_xml.
      #
      # @param [Oga::XML::Document] doc
      # @param [String] output
      def on_document(doc, output)
        if doc.xml_declaration
          on_xml_declaration(doc.xml_declaration, output)
          output << "\n"
        end

        if doc.doctype
          on_doctype(doc.doctype, output)
          output << "\n"
        end
      end

      # @param [Oga::XML::XmlDeclaration] node
      # @param [String] output
      def on_xml_declaration(node, output)
        output << '<?xml'

        [:version, :encoding, :standalone].each do |getter|
          value = node.send(getter)

          output << %Q{ #{getter}="#{value}"} if value
        end

        output << ' ?>'
      end

      # Returns true if the element should be written as a single
      # self-closing tag. In HTML mode only void elements may self-close;
      # otherwise any element without children does.
      #
      # @param [Oga::XML::Element] element
      # @return [TrueClass|FalseClass]
      def self_closing?(element)
        if @html_mode && !HTML_VOID_ELEMENTS.allow?(element.name)
          false
        else
          element.children.empty?
        end
      end
    end
  end
end

View File

@ -5,6 +5,7 @@ module Oga
# nodes. # nodes.
class Node class Node
include Traversal include Traversal
include ToXML
# @return [Oga::XML::NodeSet] # @return [Oga::XML::NodeSet]
attr_reader :node_set attr_reader :node_set

View File

@ -15,11 +15,6 @@ module Oga
@name = options[:name] @name = options[:name]
end end
# @return [String]
def to_xml
"<?#{name}#{text}?>"
end
# @return [String] # @return [String]
def inspect def inspect
"ProcessingInstruction(name: #{name.inspect} text: #{text.inspect})" "ProcessingInstruction(name: #{name.inspect} text: #{text.inspect})"

View File

@ -28,15 +28,6 @@ module Oga
@text @text
end end
# @see [Oga::XML::CharacterNode#to_xml]
def to_xml
return super if inside_literal_html?
Entities.encode(super)
end
private
# @return [TrueClass|FalseClass] # @return [TrueClass|FalseClass]
def decode_entities? def decode_entities?
!@decoded && !inside_literal_html? !@decoded && !inside_literal_html?
@ -46,8 +37,7 @@ module Oga
def inside_literal_html? def inside_literal_html?
node = parent node = parent
node.is_a?(Element) && html? && node && html? && node.literal_html_name?
Lexer::LITERAL_HTML_ELEMENTS.allow?(node.name)
end end
end # Text end # Text
end # XML end # XML

12
lib/oga/xml/to_xml.rb Normal file
View File

@ -0,0 +1,12 @@
module Oga
  module XML
    # Mixin adding `#to_xml` to any object the XML generator knows how to
    # serialise. The including object is handed to a new Generator as the
    # starting point of the output.
    module ToXML
      # Serialises the receiver back to XML.
      #
      # @return [String]
      def to_xml
        generator = Generator.new(self)

        generator.to_xml
      end
    end
  end
end

View File

@ -2,6 +2,8 @@ module Oga
module XML module XML
# Class containing information about an XML declaration tag. # Class containing information about an XML declaration tag.
class XmlDeclaration class XmlDeclaration
include ToXML
# @return [String] # @return [String]
attr_accessor :version attr_accessor :version
@ -23,21 +25,6 @@ module Oga
@standalone = options[:standalone] @standalone = options[:standalone]
end end
# Converts the declaration tag to XML.
#
# @return [String]
def to_xml
pairs = []
[:version, :encoding, :standalone].each do |getter|
value = send(getter)
pairs << %Q{#{getter}="#{value}"} if value
end
"<?xml #{pairs.join(' ')} ?>"
end
# @return [String] # @return [String]
def inspect def inspect
segments = [] segments = []

View File

@ -14,12 +14,6 @@ describe Oga::XML::CharacterNode do
end end
end end
describe '#to_xml' do
it 'converts the node to XML' do
described_class.new(:text => 'a').to_xml.should == 'a'
end
end
describe '#inspect' do describe '#inspect' do
it 'returns the inspect value' do it 'returns the inspect value' do
described_class.new(:text => 'a').inspect.should == 'CharacterNode("a")' described_class.new(:text => 'a').inspect.should == 'CharacterNode("a")'

View File

@ -142,4 +142,10 @@ Document(
EOF EOF
end end
end end
describe '#literal_html_name?' do
it 'returns false' do
described_class.new.literal_html_name?.should == false
end
end
end end

View File

@ -636,4 +636,14 @@ describe Oga::XML::Element do
end end
end end
end end
describe '#literal_html_name?' do
it 'returns true for an element name matching one of the literal HTML elements' do
described_class.new(:name => 'script').literal_html_name?.should == true
end
it 'returns false for an element name not matching one of the literal HTML elements' do
described_class.new(:name => 'foo').literal_html_name?.should == false
end
end
end end

View File

@ -0,0 +1,51 @@
require 'spec_helper'
describe Oga::XML::Generator do
  describe '#to_xml' do
    describe 'using an unsupported root type' do
      it 'raises TypeError' do
        # A Symbol is not a node type the generator can serialise.
        serialise = lambda { described_class.new(:foo).to_xml }

        serialise.should raise_error(TypeError)
      end
    end

    describe 'using an Element as the root node' do
      it 'returns a String' do
        node = Oga::XML::Element.new(name: 'foo')
        node.set('attr', 'value')

        described_class.new(node).to_xml.should == '<foo attr="value" />'
      end
    end

    describe 'using a Document as the root node' do
      it 'returns a String' do
        child    = Oga::XML::Element.new(name: 'foo')
        document = Oga::XML::Document.new(children: [child])

        described_class.new(document).to_xml.should == '<foo />'
      end
    end

    describe 'using Element nodes with siblings' do
      it 'returns a String' do
        # <root> holds two siblings, the second of which has its own child.
        first  = Oga::XML::Element.new(name: 'a')
        nested = Oga::XML::Element.new(name: 'c')
        second = Oga::XML::Element.new(name: 'b', children: [nested])
        tree   = Oga::XML::Element.new(name: 'root', children: [first, second])

        xml = described_class.new(tree).to_xml

        xml.should == '<root><a /><b><c /></b></root>'
      end
    end
  end
end