oga/lib/oga/xml/pull_parser.rb

166 lines
3.8 KiB
Ruby

module Oga
module XML
##
# The PullParser class can be used to parse an XML document incrementally
# instead of parsing it as a whole. This results in lower memory usage and
# potentially faster parsing times. The downside is that pull parsers are
# typically more difficult to use compared to DOM parsers.
#
# Basic parsing using this class works as follows:
#
# parser = Oga::XML::PullParser.new('... xml here ...')
#
# parser.parse do |node|
# if node.is_a?(Oga::XML::Element)
#
# end
# end
#
# This parser yields proper XML instances such as {Oga::XML::Element}.
# Doctypes and XML declarations are ignored by this parser.
#
# @!attribute [r] node
# The current node.
# @return [Oga::XML::Node]
#
# @!attribute [r] nesting
# Array containing the names of the currently nested elements.
# @return [Array]
#
class PullParser < Parser
attr_reader :node, :nesting
##
# Names of the parser callbacks that this class overrides with methods that
# do nothing and return nil (see the `DISABLED_CALLBACKS.each` loop further
# down). The list is frozen so it cannot be mutated by accident.
#
# @return [Array<Symbol>]
#
DISABLED_CALLBACKS = [
  :on_document,
  :on_doctype,
  :on_xml_decl,
  :on_element_children
].freeze
##
# Names of the parser callbacks whose return value is stored as the current
# node and then passed to the block given to {#parse} (see the
# `BLOCK_CALLBACKS.each` loop further down). The list is frozen so it cannot
# be mutated by accident.
#
# @return [Array<Symbol>]
#
BLOCK_CALLBACKS = [
  :on_cdata,
  :on_comment,
  :on_text
].freeze
##
# Returns the parser to a clean state: runs the parent's reset logic first,
# then forgets the stored block, the current node and the element nesting.
#
# @see Oga::XML::Parser#reset
#
def reset
  super

  @nesting = []
  @block = @node = nil
end
##
# Runs the parser over the input. Every cdata, comment, text and element node
# that is encountered is passed to the supplied block. Once parsing has
# finished the parser state is reset.
#
# @yieldparam [Oga::XML::Node] node
# @return [NilClass]
#
def parse(&block)
  # Keep the block around so the callback methods can reach it.
  @block = block

  yyparse(self, :yield_next_token)
  reset

  nil
end
##
# Calls the supplied block if the current node type and, optionally, the
# nesting match. This method allows you to write this:
#
#     parser.parse do |node|
#       parser.on(:text, %w{people person name}) do
#         puts node.text
#       end
#     end
#
# Instead of this:
#
#     parser.parse do |node|
#       if node.node_type == :text &&
#           parser.nesting == %w{people person name}
#         puts node.text
#       end
#     end
#
# When calling this method you can specify the following node types:
#
# * `:cdata`
# * `:comment`
# * `:element`
# * `:text`
#
# @example
#   parser.on(:element, %w{people person name}) do
#
#   end
#
# @param [Symbol] type The type of node to act upon. This is a symbol as
#  returned by {Oga::XML::Node#node_type}.
#
# @param [Array] nesting The element name nesting to act upon. When empty
#  (the default) only the node type is compared.
#
# @yield Runs the block when the current node matches.
#
# @return [Object, nil] The block's result, or nil when nothing matched.
#
def on(type, nesting = [])
  return unless node.node_type == type

  # The `nesting` parameter shadows the attribute reader, hence `self.`.
  yield if nesting.empty? || nesting == self.nesting
end
# Generate an override that does nothing and returns nil for every callback
# listed in DISABLED_CALLBACKS.
#
# A string eval is used on purpose: eval is a heck of a lot faster than
# define_method on both Rubinius and JRuby.
DISABLED_CALLBACKS.each do |callback|
  eval(<<-EOF, nil, __FILE__, __LINE__ + 1)
def #{callback}(*args)
return
end
EOF
end
# Generate the callbacks listed in BLOCK_CALLBACKS. Each generated method
# stores whatever the parent implementation returns as the current node,
# passes it to the block stored by #parse and returns nil.
BLOCK_CALLBACKS.each do |callback|
  eval(<<-EOF, nil, __FILE__, __LINE__ + 1)
def #{callback}(*args)
@node = super
@block.call(@node)
return
end
EOF
end
##
# Stores the value returned by the parent implementation as the current
# node, appends its name to the nesting and passes it to the block stored by
# {#parse}.
#
# @see Oga::XML::Parser#on_element
# @return [NilClass]
#
def on_element(*args)
  element = super

  @node = element
  nesting << element.name
  @block.call(element)

  nil
end
##
# Removes the most recently added element name from the nesting. Any
# arguments are ignored.
#
# NOTE(review): presumably overrides the parent's hook of the same name
# (Oga::XML::Parser#after_element) - confirm against the parent class.
#
# @return [NilClass]
#
def after_element(*args)
  nesting.pop

  nil
end
end # PullParser
end # XML
end # Oga