diff --git a/CHANGELOG.md b/CHANGELOG.md index e053d68..a4a3faa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,17 @@ This document contains details of the various releases and their release dates. Dates are in the format `yyyy-mm-dd`. +## 2.4 - Unreleased + +### Serialising Large Documents + +Oga can now serialise large documents without causing the call stack to overflow +thanks to the new `Oga::XML::Generator` class. This class can generate XML +without using a stack at all. + +See issue for more +information. + ## 2.3 - 2016-07-13 Thanks to various changes provided by Erik Michaels-Ober Oga can now be used to diff --git a/lib/oga.rb b/lib/oga.rb index d87cc87..3bd9081 100644 --- a/lib/oga.rb +++ b/lib/oga.rb @@ -23,6 +23,7 @@ if RUBY_PLATFORM == 'java' end #:nocov: +require 'oga/xml/to_xml' require 'oga/xml/html_void_elements' require 'oga/xml/entities' require 'oga/xml/querying' @@ -42,6 +43,7 @@ require 'oga/xml/default_namespace' require 'oga/xml/attribute' require 'oga/xml/element' require 'oga/xml/node_set' +require 'oga/xml/generator' require 'oga/xml/sax_parser' require 'oga/xml/pull_parser' diff --git a/lib/oga/xml/attribute.rb b/lib/oga/xml/attribute.rb index c0496e1..7ae5222 100644 --- a/lib/oga/xml/attribute.rb +++ b/lib/oga/xml/attribute.rb @@ -3,6 +3,7 @@ module Oga # Class for storing information about a single XML attribute. class Attribute include ExpandedName + include ToXML # The name of the attribute. # @return [String] @@ -81,19 +82,6 @@ module Oga alias_method :to_s, :text - # @return [String] - def to_xml - if namespace_name - full_name = "#{namespace_name}:#{name}" - else - full_name = name - end - - enc_value = value ? 
Entities.encode_attribute(value) : nil - - %Q(#{full_name}="#{enc_value}") - end - # @return [String] def inspect segments = [] diff --git a/lib/oga/xml/cdata.rb b/lib/oga/xml/cdata.rb index 22180bc..439b096 100644 --- a/lib/oga/xml/cdata.rb +++ b/lib/oga/xml/cdata.rb @@ -2,12 +2,6 @@ module Oga module XML # Class used for storing information about CDATA tags. class Cdata < CharacterNode - # Converts the node back to XML. - # - # @return [String] - def to_xml - "" - end end # Cdata end # XML end # Oga diff --git a/lib/oga/xml/character_node.rb b/lib/oga/xml/character_node.rb index 5206aa8..8387028 100644 --- a/lib/oga/xml/character_node.rb +++ b/lib/oga/xml/character_node.rb @@ -15,11 +15,6 @@ module Oga @text = options[:text] end - # @return [String] - def to_xml - text.to_s - end - # @return [String] def inspect "#{self.class.to_s.split('::').last}(#{text.inspect})" diff --git a/lib/oga/xml/comment.rb b/lib/oga/xml/comment.rb index 31fed66..b5a3f90 100644 --- a/lib/oga/xml/comment.rb +++ b/lib/oga/xml/comment.rb @@ -2,12 +2,6 @@ module Oga module XML # Class used for storing information about XML comments. class Comment < CharacterNode - # Converts the node back to XML. - # - # @return [String] - def to_xml - "" - end end # Comment end # XML end # Oga diff --git a/lib/oga/xml/doctype.rb b/lib/oga/xml/doctype.rb index feb8593..c5f5bba 100644 --- a/lib/oga/xml/doctype.rb +++ b/lib/oga/xml/doctype.rb @@ -2,6 +2,8 @@ module Oga module XML # Class used for storing information about Doctypes. class Doctype + include ToXML + # The name of the doctype (e.g. "HTML"). # @return [String] attr_accessor :name @@ -39,20 +41,6 @@ module Oga @inline_rules = options[:inline_rules] end - # Converts the doctype back to XML. - # - # @return [String] - def to_xml - segments = "' - end - # Inspects the doctype. 
# # @return [String] diff --git a/lib/oga/xml/document.rb b/lib/oga/xml/document.rb index b2357da..fd4c0ee 100644 --- a/lib/oga/xml/document.rb +++ b/lib/oga/xml/document.rb @@ -5,6 +5,7 @@ module Oga class Document include Querying include Traversal + include ToXML # @return [Oga::XML::Doctype] attr_accessor :doctype @@ -56,23 +57,6 @@ module Oga self end - # Converts the document and its child nodes to XML. - # - # @return [String] - def to_xml - xml = children.map(&:to_xml).join('') - - if doctype - xml = doctype.to_xml + "\n" + xml.strip - end - - if xml_declaration - xml = xml_declaration.to_xml + "\n" + xml.strip - end - - xml - end - # @return [TrueClass|FalseClass] def html? type.equal?(:html) @@ -99,6 +83,11 @@ Document( ) EOF end + + # @return [FalseClass] + def literal_html_name? + false + end end # Document end # XML end # Oga diff --git a/lib/oga/xml/element.rb b/lib/oga/xml/element.rb index c513f3c..a2732d1 100644 --- a/lib/oga/xml/element.rb +++ b/lib/oga/xml/element.rb @@ -211,30 +211,6 @@ module Oga @children = NodeSet.new([text_node], self) end - # Converts the element and its child elements to XML. - # - # @return [String] - def to_xml - if namespace_name - full_name = "#{namespace_name}:#{name}" - else - full_name = name - end - - body = children.map(&:to_xml).join('') - attrs = '' - - attributes.each do |attr| - attrs << " #{attr.to_xml}" - end - - if self_closing? - return "<#{full_name}#{attrs} />" - else - return "<#{full_name}#{attrs}>#{body}" - end - end - # @return [String] def inspect segments = [] @@ -323,6 +299,14 @@ module Oga end end + # Returns true if the current element name is the name of one of the + # literal HTML elements. + # + # @return [TrueClass|FalseClass] + def literal_html_name? + Lexer::LITERAL_HTML_ELEMENTS.allow?(name) + end + private # Registers namespaces based on any "xmlns" attributes. 
# lib/oga/xml/generator.rb
module Oga
  module XML
    # Class for generating XML as a String based on an existing document or
    # node. The tree is walked iteratively (descend to the first child, move to
    # the next sibling, otherwise climb back to the parent), so serialising
    # deeply nested documents does not grow the Ruby call stack.
    #
    # Basic usage:
    #
    #     element = Oga::XML::Element.new(name: 'root')
    #     element.inner_text = 'hello'
    #
    #     gen = Oga::XML::Generator.new(element)
    #
    #     gen.to_xml # => "<root>hello</root>"
    #
    # @private
    class Generator
      # @param [Oga::XML::Document|Oga::XML::Node] root The node to serialise.
      def initialize(root)
        @start = root

        # HTML mode changes how text inside literal elements is escaped and
        # which elements may be self-closing. Objects exposing `root_node` are
        # asked through their root; objects that only expose `html?` (Document
        # defines it) are asked directly instead of silently defaulting to XML.
        @html_mode =
          if @start.respond_to?(:root_node)
            @start.root_node.html?
          elsif @start.respond_to?(:html?)
            @start.html?
          else
            false
          end
      end

      # Returns the XML for the node given to the constructor. Only that node
      # and its descendants are serialised; its siblings and ancestors are
      # never visited.
      #
      # @return [String]
      # @raise [TypeError] When a node of an unsupported type is encountered.
      def to_xml
        current = @start
        output = ''

        while current
          children = false

          # Determine what callback to use for the current node. The order of
          # this statement is based on how likely it is for an arm to match.
          case current
          when Oga::XML::Element
            callback = :on_element
            children = true
          when Oga::XML::Text
            callback = :on_text
          when Oga::XML::Cdata
            callback = :on_cdata
          when Oga::XML::Comment
            callback = :on_comment
          when Oga::XML::Attribute
            callback = :on_attribute
          when Oga::XML::ProcessingInstruction
            callback = :on_processing_instruction
          when Oga::XML::Doctype
            callback = :on_doctype
          when Oga::XML::XmlDeclaration
            callback = :on_xml_declaration
          when Oga::XML::Document
            callback = :on_document
            children = true
          else
            raise TypeError, "Can't serialize #{current.class} to XML"
          end

          send(callback, current, output)

          if (child_node = children && current.children[0])
            current = child_node
          elsif current.equal?(@start)
            # The start node has no children left to process. Stop here so
            # that none of its siblings end up in the output.
            after_element(current, output) if current.is_a?(Element)

            break
          else
            # A childless element still needs its closing tag (when it is not
            # self-closing) before moving on to siblings or parents.
            after_element(current, output) if current.is_a?(Element)

            # Climb until a node with a next sibling is found, closing every
            # element that is left behind. Reaching the start node ends the
            # walk; `next_node` is falsy at that point, ending the outer loop.
            until (next_node = current.is_a?(Node) && current.next)
              if current.is_a?(Node) && !current.equal?(@start)
                current = current.parent
              end

              after_element(current, output) if current.is_a?(Element)

              break if current.equal?(@start)
            end

            current = next_node
          end
        end

        output
      end

      # Appends a text node. Text inside literal HTML elements is written
      # as-is in HTML mode; all other text is entity encoded.
      #
      # @param [Oga::XML::Text] node
      # @param [String] output
      def on_text(node, output)
        if @html_mode && (parent = node.parent) && parent.literal_html_name?
          output << node.text
        else
          output << Entities.encode(node.text)
        end
      end

      # @param [Oga::XML::Cdata] node
      # @param [String] output
      def on_cdata(node, output)
        output << "<![CDATA[#{node.text}]]>"
      end

      # @param [Oga::XML::Comment] node
      # @param [String] output
      def on_comment(node, output)
        output << "<!--#{node.text}-->"
      end

      # @param [Oga::XML::ProcessingInstruction] node
      # @param [String] output
      def on_processing_instruction(node, output)
        output << "<?#{node.name}#{node.text}?>"
      end

      # Appends the opening tag (or the complete self-closing tag) of an
      # element, including its attributes. Children and the closing tag are
      # handled by the traversal in {#to_xml}.
      #
      # @param [Oga::XML::Element] element
      # @param [String] output
      def on_element(element, output)
        name = element.expanded_name
        attrs = ''

        element.attributes.each do |attr|
          attrs << ' '
          on_attribute(attr, attrs)
        end

        if self_closing?(element)
          output << "<#{name}#{attrs} />"
        else
          output << "<#{name}#{attrs}>"
        end
      end

      # Appends the closing tag of an element, unless the element was written
      # as a self-closing tag.
      #
      # @param [Oga::XML::Element] element
      # @param [String] output
      def after_element(element, output)
        output << "</#{element.expanded_name}>" unless self_closing?(element)
      end

      # @param [Oga::XML::Attribute] attr
      # @param [String] output
      def on_attribute(attr, output)
        name = attr.expanded_name
        enc_value = attr.value ? Entities.encode_attribute(attr.value) : nil

        output << %Q(#{name}="#{enc_value}")
      end

      # NOTE(review): assumes Doctype exposes `type`, `public_id` and
      # `system_id` next to `name` and `inline_rules` - confirm against
      # Oga::XML::Doctype.
      #
      # @param [Oga::XML::Doctype] node
      # @param [String] output
      def on_doctype(node, output)
        output << "<!DOCTYPE #{node.name}"

        output << " #{node.type}" if node.type
        output << %Q{ "#{node.public_id}"} if node.public_id
        output << %Q{ "#{node.system_id}"} if node.system_id
        output << " [#{node.inline_rules}]" if node.inline_rules
        output << '>'
      end

      # Appends the XML declaration and doctype of a document, each followed
      # by a newline. The child nodes are handled by the traversal in
      # {#to_xml}.
      #
      # @param [Oga::XML::Document] doc
      # @param [String] output
      def on_document(doc, output)
        if doc.xml_declaration
          on_xml_declaration(doc.xml_declaration, output)
          output << "\n"
        end

        if doc.doctype
          on_doctype(doc.doctype, output)
          output << "\n"
        end
      end

      # @param [Oga::XML::XmlDeclaration] node
      # @param [String] output
      def on_xml_declaration(node, output)
        output << '<?xml'

        [:version, :encoding, :standalone].each do |getter|
          value = node.send(getter)

          output << %Q{ #{getter}="#{value}"} if value
        end

        output << ' ?>'
      end

      # In HTML mode only void elements may be self-closing; otherwise any
      # element without children is.
      #
      # @param [Oga::XML::Element] element
      # @return [TrueClass|FalseClass]
      def self_closing?(element)
        if @html_mode && !HTML_VOID_ELEMENTS.allow?(element.name)
          false
        else
          element.children.empty?
        end
      end
    end
  end
end
class Node include Traversal + include ToXML # @return [Oga::XML::NodeSet] attr_reader :node_set diff --git a/lib/oga/xml/processing_instruction.rb b/lib/oga/xml/processing_instruction.rb index be05af7..c225c23 100644 --- a/lib/oga/xml/processing_instruction.rb +++ b/lib/oga/xml/processing_instruction.rb @@ -15,11 +15,6 @@ module Oga @name = options[:name] end - # @return [String] - def to_xml - "" - end - # @return [String] def inspect "ProcessingInstruction(name: #{name.inspect} text: #{text.inspect})" diff --git a/lib/oga/xml/text.rb b/lib/oga/xml/text.rb index 2ee1734..20e013c 100644 --- a/lib/oga/xml/text.rb +++ b/lib/oga/xml/text.rb @@ -28,15 +28,6 @@ module Oga @text end - # @see [Oga::XML::CharacterNode#to_xml] - def to_xml - return super if inside_literal_html? - - Entities.encode(super) - end - - private - # @return [TrueClass|FalseClass] def decode_entities? !@decoded && !inside_literal_html? @@ -46,8 +37,7 @@ module Oga def inside_literal_html? node = parent - node.is_a?(Element) && html? && - Lexer::LITERAL_HTML_ELEMENTS.allow?(node.name) + node && html? && node.literal_html_name? end end # Text end # XML diff --git a/lib/oga/xml/to_xml.rb b/lib/oga/xml/to_xml.rb new file mode 100644 index 0000000..fd56ffe --- /dev/null +++ b/lib/oga/xml/to_xml.rb @@ -0,0 +1,12 @@ +module Oga + module XML + # Module that provides a `#to_xml` method that serialises the current node + # back to XML. + module ToXML + # @return [String] + def to_xml + Generator.new(self).to_xml + end + end + end +end diff --git a/lib/oga/xml/xml_declaration.rb b/lib/oga/xml/xml_declaration.rb index 2ad6e16..f5b6268 100644 --- a/lib/oga/xml/xml_declaration.rb +++ b/lib/oga/xml/xml_declaration.rb @@ -2,6 +2,8 @@ module Oga module XML # Class containing information about an XML declaration tag. class XmlDeclaration + include ToXML + # @return [String] attr_accessor :version @@ -23,21 +25,6 @@ module Oga @standalone = options[:standalone] end - # Converts the declaration tag to XML. 
- # - # @return [String] - def to_xml - pairs = [] - - [:version, :encoding, :standalone].each do |getter| - value = send(getter) - - pairs << %Q{#{getter}="#{value}"} if value - end - - "" - end - # @return [String] def inspect segments = [] diff --git a/spec/oga/xml/character_node_spec.rb b/spec/oga/xml/character_node_spec.rb index e21c681..69ad9a5 100644 --- a/spec/oga/xml/character_node_spec.rb +++ b/spec/oga/xml/character_node_spec.rb @@ -14,12 +14,6 @@ describe Oga::XML::CharacterNode do end end - describe '#to_xml' do - it 'converts the node to XML' do - described_class.new(:text => 'a').to_xml.should == 'a' - end - end - describe '#inspect' do it 'returns the inspect value' do described_class.new(:text => 'a').inspect.should == 'CharacterNode("a")' diff --git a/spec/oga/xml/document_spec.rb b/spec/oga/xml/document_spec.rb index 119d600..6c2ed33 100644 --- a/spec/oga/xml/document_spec.rb +++ b/spec/oga/xml/document_spec.rb @@ -142,4 +142,10 @@ Document( EOF end end + + describe '#literal_html_name?' do + it 'returns false' do + described_class.new.literal_html_name?.should == false + end + end end diff --git a/spec/oga/xml/element_spec.rb b/spec/oga/xml/element_spec.rb index d43c6e5..9bfd17f 100644 --- a/spec/oga/xml/element_spec.rb +++ b/spec/oga/xml/element_spec.rb @@ -636,4 +636,14 @@ describe Oga::XML::Element do end end end + + describe '#literal_html_name?' 
do + it 'returns true for an element name matching one of the literal HTML elements' do + described_class.new(:name => 'script').literal_html_name?.should == true + end + + it 'returns false for an element name not matching one of the literal HTML elements' do + described_class.new(:name => 'foo').literal_html_name?.should == false + end + end end diff --git a/spec/oga/xml/generator_spec.rb b/spec/oga/xml/generator_spec.rb new file mode 100644 index 0000000..683daea --- /dev/null +++ b/spec/oga/xml/generator_spec.rb @@ -0,0 +1,51 @@ +require 'spec_helper' + +describe Oga::XML::Generator do + describe '#to_xml' do + describe 'using an unsupported root type' do + it 'raises TypeError' do + -> { described_class.new(:foo).to_xml }.should raise_error(TypeError) + end + end + + describe 'using an Element as the root node' do + it 'returns a String' do + element = Oga::XML::Element.new(name: 'foo') + element.set('attr', 'value') + + output = described_class.new(element).to_xml + + output.should == '' + end + end + + describe 'using a Document as the root node' do + it 'returns a String' do + element = Oga::XML::Element.new(name: 'foo') + doc = Oga::XML::Document.new(children: [element]) + output = described_class.new(doc).to_xml + + output.should == '' + end + end + + describe 'using Element nodes with siblings' do + it 'returns a String' do + root = Oga::XML::Element.new( + name: 'root', + children: [ + Oga::XML::Element.new(name: 'a'), + Oga::XML::Element.new( + name: 'b', + children: [Oga::XML::Element.new(name: 'c')] + ) + ] + ) + + output = described_class.new(root).to_xml + + output.should == '' + end + end + end +end