diff --git a/lib/oga.rb b/lib/oga.rb index 75cf725..5c18861 100644 --- a/lib/oga.rb +++ b/lib/oga.rb @@ -37,8 +37,11 @@ require_relative 'oga/xml/attribute' require_relative 'oga/xml/element' require_relative 'oga/xml/node_set' +require_relative 'oga/xml/sax_parser' require_relative 'oga/xml/pull_parser' + require_relative 'oga/html/parser' +require_relative 'oga/html/sax_parser' require_relative 'oga/xpath/node' require_relative 'oga/xpath/lexer' diff --git a/lib/oga/html/sax_parser.rb b/lib/oga/html/sax_parser.rb new file mode 100644 index 0000000..07f7731 --- /dev/null +++ b/lib/oga/html/sax_parser.rb @@ -0,0 +1,18 @@ +module Oga + module HTML + ## + # SAX parser for HTML documents. See the documentation of + # {Oga::XML::SaxParser} for more information. + # + class SaxParser < XML::SaxParser + ## + # @see [Oga::XML::SaxParser#initialize] + # + def initialize(handler, data, options = {}) + options = options.merge(:html => true) + + super(handler, data, options) + end + end # SaxParser + end # HTML +end # Oga diff --git a/lib/oga/oga.rb b/lib/oga/oga.rb index 7f40378..8cf78ca 100644 --- a/lib/oga/oga.rb +++ b/lib/oga/oga.rb @@ -24,4 +24,34 @@ module Oga def self.parse_html(html) return HTML::Parser.new(html).parse end + + ## + # Parses the given XML document using the SAX parser. + # + # @example + # handler = SomeSaxHandler.new + # + # Oga.sax_parse_xml(handler, 'Hello') + # + # @param [Object] handler The SAX handler for the parser. + # @param [String|IO] xml The XML to parse. + # + def self.sax_parse_xml(handler, xml) + XML::SaxParser.new(handler, xml).parse + end + + ## + # Parses the given HTML document using the SAX parser. + # + # @example + # handler = SomeSaxHandler.new + # + # Oga.sax_parse_html(handler, '') + # + # @param [Object] handler The SAX handler for the parser. + # @param [String|IO] html The HTML to parse. 
+ # + def self.sax_parse_html(handler, html) + HTML::SaxParser.new(handler, html).parse + end end # Oga diff --git a/lib/oga/xml/sax_parser.rb b/lib/oga/xml/sax_parser.rb new file mode 100644 index 0000000..47a9574 --- /dev/null +++ b/lib/oga/xml/sax_parser.rb @@ -0,0 +1,63 @@ +module Oga + module XML + ## + # The SaxParser class provides the basic interface for writing custom SAX + # parsers. All callback methods defined in {Oga::XML::Parser} are delegated + # to a dedicated handler object. + # + # To write a custom handler for the SAX parser, create a class that + # implements one (or many) of the following callback methods: + # + # * `on_document` + # * `on_doctype` + # * `on_cdata` + # * `on_comment` + # * `on_proc_ins` + # * `on_xml_decl` + # * `on_text` + # * `on_element` + # * `on_element_children` + # * `after_element` + # + # For example: + # + # class SaxHandler + # def on_element(namespace, name, attrs = {}) + # puts name + # end + # end + # + # You can then use it as follows: + # + # handler = SaxHandler.new + # parser = Oga::XML::SaxParser.new(handler, '') + # + # parser.parse + # + # For information on the callback arguments see the documentation of the + # corresponding methods in {Oga::XML::Parser}. + # + class SaxParser < Parser + ## + # @param [Object] handler The SAX handler to delegate callbacks to. + # @see [Oga::XML::Parser#initialize] + # + def initialize(handler, *args) + @handler = handler + + super(*args) + end + + # Delegate all callbacks to the handler object. 
+ instance_methods.grep(/^(on_|after_)/).each do |method| + eval <<-EOF, nil, __FILE__, __LINE__ + 1 + def #{method}(*args) + @handler.#{method}(*args) if @handler.respond_to?(:#{method}) + + return + end + EOF + end + end # SaxParser + end # XML +end # Oga diff --git a/spec/oga/html/sax_parser_spec.rb b/spec/oga/html/sax_parser_spec.rb new file mode 100644 index 0000000..c4c775c --- /dev/null +++ b/spec/oga/html/sax_parser_spec.rb @@ -0,0 +1,22 @@ +require 'spec_helper' + +describe Oga::HTML::SaxParser do + before do + @handler = Class.new do + attr_reader :name + + def on_element(namespace, name, attrs = {}) + @name = name + end + end + end + + example 'use custom callback methods if defined' do + handler = @handler.new + parser = described_class.new(handler, '') + + parser.parse + + handler.name.should == 'link' + end +end diff --git a/spec/oga/oga_spec.rb b/spec/oga/oga_spec.rb index 7f42554..569a788 100644 --- a/spec/oga/oga_spec.rb +++ b/spec/oga/oga_spec.rb @@ -1,19 +1,41 @@ require 'spec_helper' describe Oga do - context 'parse_xml' do - example 'parse an XML document' do - document = described_class.parse_xml('foo') + example 'parse an XML document' do + document = described_class.parse_xml('foo') - document.is_a?(Oga::XML::Document).should == true - end + document.is_a?(Oga::XML::Document).should == true end - context 'parse_html' do - example 'parse an HTML document' do - document = described_class.parse_xml('') + example 'parse an HTML document' do + document = described_class.parse_xml('') - document.is_a?(Oga::XML::Document).should == true + document.is_a?(Oga::XML::Document).should == true + end + + context 'SAX parsing' do + before do + klass = Class.new do + attr_reader :name + + def on_element(namespace, name, attrs = {}) + @name = name + end + end + + @handler = klass.new + end + + example 'parse an XML document using the SAX parser' do + Oga.sax_parse_xml(@handler, '') + + @handler.name.should == 'foo' + end + + example 'parse an HTML document 
using the SAX parser' do + Oga.sax_parse_xml(@handler, '') + + @handler.name.should == 'link' end end end diff --git a/spec/oga/xml/sax_parser_spec.rb b/spec/oga/xml/sax_parser_spec.rb new file mode 100644 index 0000000..03686d0 --- /dev/null +++ b/spec/oga/xml/sax_parser_spec.rb @@ -0,0 +1,35 @@ +require 'spec_helper' + +describe Oga::XML::SaxParser do + before do + @handler = Class.new do + attr_reader :name + + def on_element(namespace, name, attrs = {}) + @name = name + end + end + end + + example 'ignore return values of callback methods' do + parser = described_class.new(@handler.new, 'foo') + + parser.parse.should be_nil + end + + example 'use custom callback methods if defined' do + handler = @handler.new + parser = described_class.new(handler, '') + + parser.parse + + handler.name.should == 'foo' + end + + example 'ignore callbacks that are not defined in the handler' do + parser = described_class.new(@handler.new, '') + + # This would raise if undefined callbacks were _not_ ignored. + lambda { parser.parse }.should_not raise_error + end +end