Basic pull parsing setup.
This parser extends the regular DOM parser but, instead of building a DOM tree, delegates certain nodes to a block. The API is a bit raw in its current form, but I'll extend it and make it more user-friendly in the following commits. In particular, I want to make it easier to figure out whether a certain node is nested inside another node.
This commit is contained in:
parent
fd5bbbc9a2
commit
030a0068bd
|
@ -2,6 +2,8 @@ require 'set'
|
|||
|
||||
require_relative 'oga/xml/lexer'
|
||||
require_relative 'oga/xml/parser'
|
||||
require_relative 'oga/xml/pull_parser'
|
||||
|
||||
require_relative 'oga/xml/node'
|
||||
require_relative 'oga/xml/element'
|
||||
require_relative 'oga/xml/document'
|
||||
|
|
|
@ -0,0 +1,86 @@
|
|||
module Oga
  module XML
    ##
    # The PullParser class can be used to parse an XML document incrementally
    # instead of parsing it as a whole. This results in lower memory usage and
    # potentially faster parsing times. The downside is that pull parsers are
    # typically more difficult to use compared to DOM parsers.
    #
    # Basic parsing using this class works as follows:
    #
    #     parser = Oga::XML::PullParser.new('... xml here ...')
    #
    #     parser.parse do |node|
    #       if node.is_a?(Oga::XML::Element)
    #         # process the element here
    #       end
    #     end
    #
    # This parser yields proper XML instances such as {Oga::XML::Element}.
    # Doctypes and XML declarations are ignored by this parser.
    #
    class PullParser < Parser
      ##
      # Parser callbacks that are overridden below to do nothing and return
      # nil.
      #
      # @return [Array]
      #
      DISABLED_CALLBACKS = [
        :on_document,
        :on_doctype,
        :on_xml_decl,
        :on_element_children
      ].freeze

      ##
      # Parser callbacks whose result (as produced by the parent class) is
      # passed to the block given to {#parse}.
      #
      # @return [Array]
      #
      BLOCK_CALLBACKS = [
        :on_cdata,
        :on_comment,
        :on_text,
        :on_element
      ].freeze

      ##
      # Resets the parser and forgets the block stored by {#parse}.
      #
      # @see Oga::XML::Parser#reset
      #
      def reset
        super

        @block = nil
      end

      ##
      # Parses the input and yields every node to the supplied block.
      #
      # The parser is reset once parsing finishes, including when the block
      # or the underlying parser raises, so that the stored block and any
      # parser state never outlive a single call.
      #
      # @yieldparam [Oga::XML::Node]
      # @return [NilClass]
      #
      def parse(&block)
        @block = block

        yyparse(self, :yield_next_token)

        nil
      ensure
        reset
      end

      # eval is a heck of a lot faster than define_method on both Rubinius and
      # JRuby.
      DISABLED_CALLBACKS.each do |method|
        eval <<-EOF, nil, __FILE__, __LINE__ + 1
        def #{method}(*_)
        end
        EOF
      end

      BLOCK_CALLBACKS.each do |method|
        # The bare `super` forwards the received arguments to the parent
        # callback; whatever it returns is handed to the block. @block is only
        # set while #parse is running.
        eval <<-EOF, nil, __FILE__, __LINE__ + 1
        def #{method}(*args)
          @block.call(super)
        end
        EOF
      end
    end # PullParser
  end # XML
end # Oga
|
|
@ -0,0 +1,19 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::XML::PullParser do
  context 'doctypes' do
    # The parser instance is created once for this context.
    before :all do
      @parser = described_class.new('<!DOCTYPE html>')
    end

    example 'ignore doctypes' do
      # Counts how many times the parser invokes the block.
      amount = 0

      @parser.parse { amount += 1 }

      amount.should == 0
    end
  end
end
|
|
@ -0,0 +1,29 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::XML::PullParser do
  context 'elements' do
    # The parser instance is created once and shared by both examples.
    before :all do
      @parser = described_class.new('<person>Alice</person>')
    end

    example 'parse an element' do
      name = nil

      # Record the name of any element node handed to the block.
      @parser.parse do |node|
        next unless node.is_a?(Oga::XML::Element)

        name = node.name
      end

      name.should == 'person'
    end

    example 'parse the text of an element' do
      text = nil

      # Record the contents of any text node handed to the block.
      @parser.parse do |node|
        next unless node.is_a?(Oga::XML::Text)

        text = node.text
      end

      text.should == 'Alice'
    end
  end
end
|
Loading…
Reference in New Issue