Basic pull parsing setup.

This parser extends the regular DOM parser, but delegates certain nodes to a
block instead of building a DOM tree.

The API is a bit raw in its current form but I'll extend it and make it a bit
more user friendly in the following commits. In particular I want to make it
easier to figure out if a certain node is nested inside another node.
This commit is contained in:
Yorick Peterse 2014-04-28 17:22:17 +02:00
parent fd5bbbc9a2
commit 030a0068bd
4 changed files with 136 additions and 0 deletions

View File

@ -2,6 +2,8 @@ require 'set'
require_relative 'oga/xml/lexer'
require_relative 'oga/xml/parser'
require_relative 'oga/xml/pull_parser'
require_relative 'oga/xml/node'
require_relative 'oga/xml/element'
require_relative 'oga/xml/document'

View File

@ -0,0 +1,86 @@
module Oga
  module XML
    ##
    # The PullParser class can be used to parse an XML document incrementally
    # instead of parsing it as a whole. This results in lower memory usage and
    # potentially faster parsing times. The downside is that pull parsers are
    # typically more difficult to use compared to DOM parsers.
    #
    # Basic parsing using this class works as follows:
    #
    #     parser = Oga::XML::PullParser.new('... xml here ...')
    #
    #     parser.parse do |node|
    #       if node.is_a?(Oga::XML::Element)
    #         # handle the element here
    #       end
    #     end
    #
    # This parser yields proper XML instances such as {Oga::XML::Element}.
    # Doctypes and XML declarations are ignored by this parser.
    #
    class PullParser < Parser
      ##
      # Names of the parser callbacks that are overridden to do nothing and
      # return `nil`, so the corresponding constructs are never yielded.
      #
      # @return [Array<Symbol>]
      #
      DISABLED_CALLBACKS = [
        :on_document,
        :on_doctype,
        :on_xml_decl,
        :on_element_children
      ].freeze

      ##
      # Names of the parser callbacks whose result (as returned by the parent
      # implementation) is passed to the block given to {#parse}.
      #
      # @return [Array<Symbol>]
      #
      BLOCK_CALLBACKS = [
        :on_cdata,
        :on_comment,
        :on_text,
        :on_element
      ].freeze

      ##
      # Resets the parser state and forgets the block of the previous
      # {#parse} call.
      #
      # @see Oga::XML::Parser#reset
      #
      def reset
        super

        @block = nil
      end

      ##
      # Parses the input and yields every node to the supplied block.
      #
      # A block is required for any input that produces nodes: the callbacks
      # listed in {BLOCK_CALLBACKS} call the block unconditionally.
      #
      # The parser is reset afterwards, also when the block or the parser
      # raises, so that a failed run does not leave stale state (or a
      # reference to the block) behind.
      #
      # @yieldparam [Oga::XML::Node]
      # @return [NilClass]
      #
      def parse(&block)
        @block = block

        yyparse(self, :yield_next_token)

        return
      ensure
        reset
      end

      # eval is a heck of a lot faster than define_method on both Rubinius and
      # JRuby.
      DISABLED_CALLBACKS.each do |method|
        eval <<-EOF, nil, __FILE__, __LINE__ + 1
        def #{method}(*_)
          return
        end
        EOF
      end

      # Each of these callbacks lets the parent class handle the arguments
      # (bare `super` forwards them as-is) and hands the result to the block.
      BLOCK_CALLBACKS.each do |method|
        eval <<-EOF, nil, __FILE__, __LINE__ + 1
        def #{method}(*args)
          @block.call(super)
        end
        EOF
      end
    end # PullParser
  end # XML
end # Oga

View File

@ -0,0 +1,19 @@
require 'spec_helper'
describe Oga::XML::PullParser do
  context 'doctypes' do
    before :all do
      @parser = described_class.new('<!DOCTYPE html>')
    end

    # The input consists of nothing but a doctype, so the block is expected
    # to never run.
    example 'ignore doctypes' do
      yielded = 0

      @parser.parse { yielded += 1 }

      yielded.should == 0
    end
  end
end

View File

@ -0,0 +1,29 @@
require 'spec_helper'
describe Oga::XML::PullParser do
  context 'elements' do
    # A single parser instance is shared by both examples below.
    before :all do
      @parser = described_class.new('<person>Alice</person>')
    end

    example 'parse an element' do
      element_name = nil

      # Only element nodes are of interest here; everything else is skipped.
      @parser.parse do |node|
        next unless node.is_a?(Oga::XML::Element)

        element_name = node.name
      end

      element_name.should == 'person'
    end

    example 'parse the text of an element' do
      contents = nil

      # Only text nodes are of interest here; everything else is skipped.
      @parser.parse do |node|
        next unless node.is_a?(Oga::XML::Text)

        contents = node.text
      end

      contents.should == 'Alice'
    end
  end
end