Implemented a basic SAX API.

This API is a little bit dodgy (similar to Nokogiri's API) due to the use of
separate parser and handler classes. This is done to ensure that the return
values of callback methods (e.g. on_element) aren't used by Racc for building
the AST. This also ensures that whatever variables are set by the handler
don't conflict with any variables of the parser.

This fixes #42.
This commit is contained in:
Yorick Peterse 2014-09-16 14:30:46 +02:00
parent 9e935e5d24
commit 317b49bcf6
7 changed files with 202 additions and 9 deletions

View File

@ -37,8 +37,11 @@ require_relative 'oga/xml/attribute'
require_relative 'oga/xml/element' require_relative 'oga/xml/element'
require_relative 'oga/xml/node_set' require_relative 'oga/xml/node_set'
require_relative 'oga/xml/sax_parser'
require_relative 'oga/xml/pull_parser' require_relative 'oga/xml/pull_parser'
require_relative 'oga/html/parser' require_relative 'oga/html/parser'
require_relative 'oga/html/sax_parser'
require_relative 'oga/xpath/node' require_relative 'oga/xpath/node'
require_relative 'oga/xpath/lexer' require_relative 'oga/xpath/lexer'

View File

@ -0,0 +1,18 @@
module Oga
  module HTML
    ##
    # SAX parser for HTML input. All parsing behaviour is inherited from
    # {Oga::XML::SaxParser}; refer to that class for further details.
    #
    class SaxParser < XML::SaxParser
      ##
      # Forwards to the parent constructor with the `:html` option set to
      # `true`. The options Hash supplied by the caller is left untouched
      # because a merged copy is passed along instead.
      #
      # @param [Object] handler Passed through to the parent constructor.
      # @param data Passed through to the parent constructor.
      # @param [Hash] options Extra options; `:html` is always overridden.
      # @see Oga::XML::SaxParser#initialize
      #
      def initialize(handler, data, options = {})
        html_options = options.merge(:html => true)

        super(handler, data, html_options)
      end
    end # SaxParser
  end # HTML
end # Oga

View File

@ -24,4 +24,34 @@ module Oga
def self.parse_html(html) def self.parse_html(html)
return HTML::Parser.new(html).parse return HTML::Parser.new(html).parse
end end
##
# Parses the given XML document using the SAX parser.
#
# @example
#  handler = SomeSaxHandler.new
#
#  Oga.sax_parse_xml(handler, '<root>Hello</root>')
#
# @param [Object] handler The SAX handler for the parser.
# @param [String|IO] xml The XML to parse.
#
def self.sax_parse_xml(handler, xml)
  XML::SaxParser.new(handler, xml).parse
end
##
# Parses the given HTML document using the SAX parser.
#
# @example
#  handler = SomeSaxHandler.new
#
#  Oga.sax_parse_html(handler, '<script>foo()</script>')
#
# @param [Object] handler The SAX handler for the parser.
# @param [String|IO] html The HTML to parse.
#
def self.sax_parse_html(handler, html)
  HTML::SaxParser.new(handler, html).parse
end
end # Oga end # Oga

63
lib/oga/xml/sax_parser.rb Normal file
View File

@ -0,0 +1,63 @@
module Oga
  module XML
    ##
    # The SaxParser class provides the basic interface for writing custom SAX
    # parsers. All callback methods defined in {Oga::XML::Parser} are delegated
    # to a dedicated handler class.
    #
    # To write a custom handler for the SAX parser, create a class that
    # implements one (or many) of the following callback methods:
    #
    # * `on_document`
    # * `on_doctype`
    # * `on_cdata`
    # * `on_comment`
    # * `on_proc_ins`
    # * `on_xml_decl`
    # * `on_text`
    # * `on_element`
    # * `on_element_children`
    # * `after_element`
    #
    # For example:
    #
    #     class SaxHandler
    #       def on_element(namespace, name, attrs = {})
    #         puts name
    #       end
    #     end
    #
    # You can then use it as following:
    #
    #     handler = SaxHandler.new
    #     parser  = Oga::XML::SaxParser.new(handler, '<foo />')
    #
    #     parser.parse
    #
    # For information on the callback arguments see the documentation of the
    # corresponding methods in {Oga::XML::Parser}.
    #
    class SaxParser < Parser
      ##
      # Stores the handler and passes every remaining argument to the parent
      # constructor unchanged.
      #
      # @param [Object] handler The SAX handler to delegate callbacks to.
      # @param [Array] args Arguments forwarded to the parent constructor.
      # @see Oga::XML::Parser#initialize
      #
      def initialize(handler, *args)
        @handler = handler

        super(*args)
      end

      # Override every inherited `on_*` / `after_*` callback so that it is
      # forwarded to the handler instead. A callback the handler does not
      # respond to is silently skipped. Each override explicitly returns nil
      # so that the handler's return value never leaks back into the parser.
      instance_methods.grep(/\A(on_|after_)/).each do |callback|
        define_method(callback) do |*args|
          @handler.public_send(callback, *args) if @handler.respond_to?(callback)

          nil
        end
      end
    end # SaxParser
  end # XML
end # Oga

View File

@ -0,0 +1,22 @@
require 'spec_helper'
describe Oga::HTML::SaxParser do
  before do
    # Anonymous handler class that exposes the name passed to the most recent
    # on_element callback.
    @handler = Class.new do
      attr_reader :name

      def on_element(namespace, name, attrs = {})
        @name = name
      end
    end
  end

  example 'use custom callback methods if defined' do
    recorder = @handler.new

    described_class.new(recorder, '<link>').parse

    recorder.name.should == 'link'
  end
end

View File

@ -1,19 +1,41 @@
require 'spec_helper' require 'spec_helper'
describe Oga do describe Oga do
context 'parse_xml' do example 'parse an XML document' do
example 'parse an XML document' do document = described_class.parse_xml('<root>foo</root>')
document = described_class.parse_xml('<root>foo</root>')
document.is_a?(Oga::XML::Document).should == true document.is_a?(Oga::XML::Document).should == true
end
end end
context 'parse_html' do example 'parse an HTML document' do
example 'parse an HTML document' do document = described_class.parse_xml('<html><body></body></html>')
document = described_class.parse_xml('<html><body></body></html>')
document.is_a?(Oga::XML::Document).should == true document.is_a?(Oga::XML::Document).should == true
end
context 'SAX parsing' do
  before do
    # Minimal handler that remembers the name passed to the most recent
    # on_element callback.
    klass = Class.new do
      attr_reader :name

      def on_element(namespace, name, attrs = {})
        @name = name
      end
    end

    @handler = klass.new
  end

  example 'parse an XML document using the SAX parser' do
    Oga.sax_parse_xml(@handler, '<foo />')

    @handler.name.should == 'foo'
  end

  example 'parse an HTML document using the SAX parser' do
    # Go through the HTML entry point so that this example actually covers
    # Oga.sax_parse_html rather than repeating the XML one.
    Oga.sax_parse_html(@handler, '<link>')

    @handler.name.should == 'link'
  end
end
end end

View File

@ -0,0 +1,35 @@
require 'spec_helper'
describe Oga::XML::SaxParser do
  before do
    # Anonymous handler class that exposes the name passed to the most recent
    # on_element callback. It deliberately defines no other callbacks.
    @handler = Class.new do
      attr_reader :name

      def on_element(namespace, name, attrs = {})
        @name = name
      end
    end
  end

  example 'ignore return values of callback methods' do
    result = described_class.new(@handler.new, 'foo').parse

    result.should be_nil
  end

  example 'use custom callback methods if defined' do
    recorder = @handler.new

    described_class.new(recorder, '<foo />').parse

    recorder.name.should == 'foo'
  end

  example 'ignore callbacks that are not defined in the handler' do
    parser = described_class.new(@handler.new, '<!--foo-->')

    # The handler only defines on_element, so any other callback triggered by
    # this input is missing from it. Parsing would raise if such callbacks
    # were _not_ ignored.
    -> { parser.parse }.should_not raise_error
  end
end