Generate XML without relying on recursion

While using recursion is an easy way of generating XML, it can cause the
call stack to overflow when serialising documents with lots of nested
nodes.

Generally there are two ways of working around this:

1. Use an explicit stack (e.g. an array or a queue of sorts) instead of
   relying on the call stack.
2. Use an algorithm that doesn't use a stack at all (e.g. Morris
   traversal).

This commit introduces the XML::Generator class which can serialize
documents back to XML without using a stack at all. This class takes
advantage of XML nodes having access to not only their child nodes, but
also their siblings and their parents.

All XML serialisation logic now resides in the XML::Generator class. In
turn the various "to_xml" methods just use this class and serialize
everything starting at "self".
This commit is contained in:
Yorick Peterse 2016-09-03 21:24:19 +02:00
parent 9ac16e2e4f
commit dd138981f6
No known key found for this signature in database
GPG Key ID: EDD30D2BEB691AC9
19 changed files with 311 additions and 122 deletions

View File

@ -3,6 +3,17 @@
This document contains details of the various releases and their release dates.
Dates are in the format `yyyy-mm-dd`.
## 2.4 - Unreleased
### Serialising Large Documents
Oga can now serialise large documents without causing the call stack to overflow
thanks to the new `Oga::XML::Generator` class. This class can generate XML
without using a stack at all.
See issue <https://github.com/YorickPeterse/oga/issues/158> for more
information.
## 2.3 - 2016-07-13
Thanks to various changes provided by Erik Michaels-Ober Oga can now be used to

View File

@ -23,6 +23,7 @@ if RUBY_PLATFORM == 'java'
end
#:nocov:
require 'oga/xml/to_xml'
require 'oga/xml/html_void_elements'
require 'oga/xml/entities'
require 'oga/xml/querying'
@ -42,6 +43,7 @@ require 'oga/xml/default_namespace'
require 'oga/xml/attribute'
require 'oga/xml/element'
require 'oga/xml/node_set'
require 'oga/xml/generator'
require 'oga/xml/sax_parser'
require 'oga/xml/pull_parser'

View File

@ -3,6 +3,7 @@ module Oga
# Class for storing information about a single XML attribute.
class Attribute
include ExpandedName
include ToXML
# The name of the attribute.
# @return [String]
@ -81,19 +82,6 @@ module Oga
alias_method :to_s, :text
# @return [String]
def to_xml
if namespace_name
full_name = "#{namespace_name}:#{name}"
else
full_name = name
end
enc_value = value ? Entities.encode_attribute(value) : nil
%Q(#{full_name}="#{enc_value}")
end
# @return [String]
def inspect
segments = []

View File

@ -2,12 +2,6 @@ module Oga
module XML
# Class used for storing information about CDATA tags.
class Cdata < CharacterNode
# Converts the node back to XML.
#
# @return [String]
def to_xml
"<![CDATA[#{text}]]>"
end
end # Cdata
end # XML
end # Oga

View File

@ -15,11 +15,6 @@ module Oga
@text = options[:text]
end
# @return [String]
def to_xml
text.to_s
end
# @return [String]
def inspect
"#{self.class.to_s.split('::').last}(#{text.inspect})"

View File

@ -2,12 +2,6 @@ module Oga
module XML
# Class used for storing information about XML comments.
class Comment < CharacterNode
# Converts the node back to XML.
#
# @return [String]
def to_xml
"<!--#{text}-->"
end
end # Comment
end # XML
end # Oga

View File

@ -2,6 +2,8 @@ module Oga
module XML
# Class used for storing information about Doctypes.
class Doctype
include ToXML
# The name of the doctype (e.g. "HTML").
# @return [String]
attr_accessor :name
@ -39,20 +41,6 @@ module Oga
@inline_rules = options[:inline_rules]
end
# Converts the doctype back to XML.
#
# @return [String]
def to_xml
segments = "<!DOCTYPE #{name}"
segments << " #{type}" if type
segments << %Q{ "#{public_id}"} if public_id
segments << %Q{ "#{system_id}"} if system_id
segments << " [#{inline_rules}]" if inline_rules
segments + '>'
end
# Inspects the doctype.
#
# @return [String]

View File

@ -5,6 +5,7 @@ module Oga
class Document
include Querying
include Traversal
include ToXML
# @return [Oga::XML::Doctype]
attr_accessor :doctype
@ -56,23 +57,6 @@ module Oga
self
end
# Converts the document and its child nodes to XML.
#
# @return [String]
def to_xml
xml = children.map(&:to_xml).join('')
if doctype
xml = doctype.to_xml + "\n" + xml.strip
end
if xml_declaration
xml = xml_declaration.to_xml + "\n" + xml.strip
end
xml
end
# @return [TrueClass|FalseClass]
def html?
type.equal?(:html)
@ -99,6 +83,11 @@ Document(
)
EOF
end
# Always returns false: a document is never a literal HTML element.
#
# Defined so that code holding the parent of a text node (for example
# `Generator#on_text`, which calls `parent.literal_html_name?`) can ask this
# question without first checking what kind of object the parent is.
#
# @return [FalseClass]
def literal_html_name?
false
end
end # Document
end # XML
end # Oga

View File

@ -211,30 +211,6 @@ module Oga
@children = NodeSet.new([text_node], self)
end
# Converts the element and its child elements to XML.
#
# @return [String]
def to_xml
if namespace_name
full_name = "#{namespace_name}:#{name}"
else
full_name = name
end
body = children.map(&:to_xml).join('')
attrs = ''
attributes.each do |attr|
attrs << " #{attr.to_xml}"
end
if self_closing?
return "<#{full_name}#{attrs} />"
else
return "<#{full_name}#{attrs}>#{body}</#{full_name}>"
end
end
# @return [String]
def inspect
segments = []
@ -323,6 +299,14 @@ module Oga
end
end
# Returns true if the current element name is the name of one of the
# literal HTML elements.
#
# @return [TrueClass|FalseClass]
def literal_html_name?
Lexer::LITERAL_HTML_ELEMENTS.allow?(name)
end
private
# Registers namespaces based on any "xmlns" attributes.

198
lib/oga/xml/generator.rb Normal file
View File

@ -0,0 +1,198 @@
module Oga
  module XML
    # Class for generating XML as a String based on an existing document.
    #
    # The tree is walked iteratively using only the `children`, `next` and
    # `parent` links of each node, so no recursion (and no explicit stack) is
    # needed and deeply nested documents can't overflow the call stack.
    #
    # Basic usage:
    #
    #     element = Oga::XML::Element.new(name: 'root')
    #     element.inner_text = 'hello'
    #
    #     gen = Oga::XML::Generator.new(element)
    #
    #     gen.to_xml # => "<root>hello</root>"
    #
    # @private
    class Generator
      # @param [Oga::XML::Document|Oga::XML::Node] root The node to serialise.
      def initialize(root)
        @start = root

        # HTML mode changes how text nodes are escaped and which elements may
        # be self-closing. Objects without a `root_node` are serialised using
        # the XML rules.
        if @start.respond_to?(:root_node)
          @html_mode = @start.root_node.html?
        else
          @html_mode = false
        end
      end

      # Returns the XML for the current root node.
      #
      # Only the start node and its descendants are serialised; siblings of
      # the start node are never included in the output.
      #
      # @return [String]
      # @raise [TypeError] When a node of an unsupported type is encountered.
      def to_xml
        current = @start
        output = ''

        while current
          children = false

          # Determine what callback to use for the current node. The order of
          # this statement is based on how likely it is for an arm to match.
          case current
          when Oga::XML::Element
            callback = :on_element
            children = true
          when Oga::XML::Text
            callback = :on_text
          when Oga::XML::Cdata
            callback = :on_cdata
          when Oga::XML::Comment
            callback = :on_comment
          when Oga::XML::Attribute
            callback = :on_attribute
          when Oga::XML::ProcessingInstruction
            callback = :on_processing_instruction
          when Oga::XML::Doctype
            callback = :on_doctype
          when Oga::XML::XmlDeclaration
            callback = :on_xml_declaration
          when Oga::XML::Document
            callback = :on_document
            children = true
          else
            raise TypeError, "Can't serialize #{current.class} to XML"
          end

          send(callback, current, output)

          if (child_node = children && current.children[0])
            # Descend first; the node is closed again while walking back up.
            current = child_node
          elsif current.equal?(@start)
            # The start node has no children. Stop here instead of moving on
            # to its siblings, which are not part of the requested subtree.
            after_element(current, output) if current.is_a?(Element)

            break
          else
            # A childless node is never visited on the way back up, so it has
            # to be closed before moving to its next sibling.
            after_element(current, output) if current.is_a?(Element)

            # Walk up until an ancestor with a next sibling is found, closing
            # every element that is left along the way.
            until (next_node = current.is_a?(Node) && current.next)
              if current.is_a?(Node) && !current.equal?(@start)
                current = current.parent
              end

              after_element(current, output) if current.is_a?(Element)

              # Stop at the start node. A missing parent link is treated the
              # same way so an inconsistent tree can't cause an endless loop.
              break if current.nil? || current.equal?(@start)
            end

            # Falsy when the loop above reached the start node, which ends the
            # outer loop.
            current = next_node
          end
        end

        output
      end

      # Appends a text node, escaping it unless it lives inside a literal HTML
      # element while in HTML mode.
      #
      # @param [Oga::XML::Text] node
      # @param [String] output
      def on_text(node, output)
        if @html_mode && (parent = node.parent) && parent.literal_html_name?
          output << node.text
        else
          output << Entities.encode(node.text)
        end
      end

      # @param [Oga::XML::Cdata] node
      # @param [String] output
      def on_cdata(node, output)
        output << "<![CDATA[#{node.text}]]>"
      end

      # @param [Oga::XML::Comment] node
      # @param [String] output
      def on_comment(node, output)
        output << "<!--#{node.text}-->"
      end

      # @param [Oga::XML::ProcessingInstruction] node
      # @param [String] output
      def on_processing_instruction(node, output)
        output << "<?#{node.name}#{node.text}?>"
      end

      # Appends the opening tag (or the complete self-closing tag) of an
      # element, including its attributes. The closing tag is written by
      # {#after_element}.
      #
      # @param [Oga::XML::Element] element
      # @param [String] output
      def on_element(element, output)
        name = element.expanded_name
        attrs = ''

        element.attributes.each do |attr|
          attrs << ' '
          on_attribute(attr, attrs)
        end

        if self_closing?(element)
          output << "<#{name}#{attrs} />"
        else
          output << "<#{name}#{attrs}>"
        end
      end

      # Appends the closing tag of an element, unless the element was written
      # as a self-closing tag.
      #
      # @param [Oga::XML::Element] element
      # @param [String] output
      def after_element(element, output)
        output << "</#{element.expanded_name}>" unless self_closing?(element)
      end

      # @param [Oga::XML::Attribute] attr
      # @param [String] output
      def on_attribute(attr, output)
        name = attr.expanded_name
        enc_value = attr.value ? Entities.encode_attribute(attr.value) : nil

        output << %Q(#{name}="#{enc_value}")
      end

      # @param [Oga::XML::Doctype] node
      # @param [String] output
      def on_doctype(node, output)
        output << "<!DOCTYPE #{node.name}"
        output << " #{node.type}" if node.type
        output << %Q{ "#{node.public_id}"} if node.public_id
        output << %Q{ "#{node.system_id}"} if node.system_id
        output << " [#{node.inline_rules}]" if node.inline_rules
        output << '>'
      end

      # Appends the XML declaration and doctype of a document, each followed
      # by a newline. The child nodes are handled by {#to_xml}.
      #
      # @param [Oga::XML::Document] doc
      # @param [String] output
      def on_document(doc, output)
        if doc.xml_declaration
          on_xml_declaration(doc.xml_declaration, output)
          output << "\n"
        end

        if doc.doctype
          on_doctype(doc.doctype, output)
          output << "\n"
        end
      end

      # @param [Oga::XML::XmlDeclaration] node
      # @param [String] output
      def on_xml_declaration(node, output)
        output << '<?xml'

        [:version, :encoding, :standalone].each do |getter|
          value = node.send(getter)

          output << %Q{ #{getter}="#{value}"} if value
        end

        output << ' ?>'
      end

      # Returns true if the element should be written as `<name />`. In HTML
      # mode only void elements qualify; in both modes the element must not
      # have any children.
      #
      # @param [Oga::XML::Element] element
      # @return [TrueClass|FalseClass]
      def self_closing?(element)
        if @html_mode && !HTML_VOID_ELEMENTS.allow?(element.name)
          false
        else
          element.children.empty?
        end
      end
    end
  end
end

View File

@ -5,6 +5,7 @@ module Oga
# nodes.
class Node
include Traversal
include ToXML
# @return [Oga::XML::NodeSet]
attr_reader :node_set

View File

@ -15,11 +15,6 @@ module Oga
@name = options[:name]
end
# @return [String]
def to_xml
"<?#{name}#{text}?>"
end
# @return [String]
def inspect
"ProcessingInstruction(name: #{name.inspect} text: #{text.inspect})"

View File

@ -28,15 +28,6 @@ module Oga
@text
end
# @see [Oga::XML::CharacterNode#to_xml]
def to_xml
return super if inside_literal_html?
Entities.encode(super)
end
private
# @return [TrueClass|FalseClass]
def decode_entities?
!@decoded && !inside_literal_html?
@ -46,8 +37,7 @@ module Oga
def inside_literal_html?
node = parent
node.is_a?(Element) && html? &&
Lexer::LITERAL_HTML_ELEMENTS.allow?(node.name)
node && html? && node.literal_html_name?
end
end # Text
end # XML

12
lib/oga/xml/to_xml.rb Normal file
View File

@ -0,0 +1,12 @@
module Oga
  module XML
    # Mixin that gives the including class a `#to_xml` method. Serialisation
    # itself is delegated to {Oga::XML::Generator}, using the receiver as the
    # starting point.
    module ToXML
      # Serialises the receiver (and whatever the generator reaches from it)
      # back to XML.
      #
      # @return [String]
      def to_xml
        generator = Generator.new(self)

        generator.to_xml
      end
    end
  end
end

View File

@ -2,6 +2,8 @@ module Oga
module XML
# Class containing information about an XML declaration tag.
class XmlDeclaration
include ToXML
# @return [String]
attr_accessor :version
@ -23,21 +25,6 @@ module Oga
@standalone = options[:standalone]
end
# Converts the declaration tag to XML.
#
# @return [String]
def to_xml
pairs = []
[:version, :encoding, :standalone].each do |getter|
value = send(getter)
pairs << %Q{#{getter}="#{value}"} if value
end
"<?xml #{pairs.join(' ')} ?>"
end
# @return [String]
def inspect
segments = []

View File

@ -14,12 +14,6 @@ describe Oga::XML::CharacterNode do
end
end
describe '#to_xml' do
it 'converts the node to XML' do
described_class.new(:text => 'a').to_xml.should == 'a'
end
end
describe '#inspect' do
it 'returns the inspect value' do
described_class.new(:text => 'a').inspect.should == 'CharacterNode("a")'

View File

@ -142,4 +142,10 @@ Document(
EOF
end
end
describe '#literal_html_name?' do
it 'returns false' do
described_class.new.literal_html_name?.should == false
end
end
end

View File

@ -636,4 +636,14 @@ describe Oga::XML::Element do
end
end
end
describe '#literal_html_name?' do
it 'returns true for an element name matching one of the literal HTML elements' do
described_class.new(:name => 'script').literal_html_name?.should == true
end
it 'returns false for an element name not matching one of the literal HTML elements' do
described_class.new(:name => 'foo').literal_html_name?.should == false
end
end
end

View File

@ -0,0 +1,51 @@
require 'spec_helper'

# Specs for the non-recursive XML generator.
describe Oga::XML::Generator do
  describe '#to_xml' do
    describe 'using an unsupported root type' do
      it 'raises TypeError' do
        # A Symbol is not one of the node types the generator knows about.
        lambda { described_class.new(:foo).to_xml }.should raise_error(TypeError)
      end
    end

    describe 'using an Element as the root node' do
      it 'returns a String' do
        node = Oga::XML::Element.new(:name => 'foo')
        node.set('attr', 'value')

        described_class.new(node).to_xml.should == '<foo attr="value" />'
      end
    end

    describe 'using a Document as the root node' do
      it 'returns a String' do
        node     = Oga::XML::Element.new(:name => 'foo')
        document = Oga::XML::Document.new(:children => [node])

        described_class.new(document).to_xml.should == '<foo />'
      end
    end

    describe 'using Element nodes with siblings' do
      it 'returns a String' do
        nested   = Oga::XML::Element.new(:name => 'c')
        siblings = [
          Oga::XML::Element.new(:name => 'a'),
          Oga::XML::Element.new(:name => 'b', :children => [nested])
        ]
        root = Oga::XML::Element.new(:name => 'root', :children => siblings)

        described_class.new(root).to_xml.should == '<root><a /><b><c /></b></root>'
      end
    end
  end
end