diff --git a/README.md b/README.md index f1f500a..7d2d8c8 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,11 @@ Parsing a simple string of XML: Oga.parse_xml('Alice') +Parsing XML using strict mode (disables automatic tag insertion): + + Oga.parse_xml('foo', :strict => true) # works fine + Oga.parse_xml('foo', :strict => true) # throws an error + Parsing a simple string of HTML: Oga.parse_html('') diff --git a/lib/oga/oga.rb b/lib/oga/oga.rb index e5e3824..b7feda2 100644 --- a/lib/oga/oga.rb +++ b/lib/oga/oga.rb @@ -5,11 +5,12 @@ module Oga # @example # document = Oga.parse_xml('Hello') # - # @param [String|IO] xml The XML input to parse. + # @see [Oga::XML::Lexer#initialize] + # # @return [Oga::XML::Document] # - def self.parse_xml(xml) - return XML::Parser.new(xml).parse + def self.parse_xml(xml, options = {}) + return XML::Parser.new(xml, options).parse end ## @@ -18,11 +19,12 @@ module Oga # @example # document = Oga.parse_html('...') # - # @param [String|IO] html The HTML input to parse. + # @see [Oga::XML::Lexer#initialize] + # # @return [Oga::XML::Document] # - def self.parse_html(html) - return HTML::Parser.new(html).parse + def self.parse_html(html, options = {}) + return HTML::Parser.new(html, options).parse end ## @@ -33,11 +35,10 @@ module Oga # # Oga.sax_parse_html(handler, 'Hello') # - # @param [Object] handler The SAX handler for the parser. - # @param [String|IO] xml The XML to parse. + # @see [Oga::XML::SaxParser#initialize] # - def self.sax_parse_xml(handler, xml) - XML::SaxParser.new(handler, xml).parse + def self.sax_parse_xml(handler, xml, options = {}) + XML::SaxParser.new(handler, xml, options).parse end ## @@ -48,10 +49,9 @@ module Oga # # Oga.sax_parse_html(handler, '') # - # @param [Object] handler The SAX handler for the parser. - # @param [String|IO] html The HTML to parse. 
+ # @see [Oga::XML::SaxParser#initialize] # - def self.sax_parse_html(handler, html) - HTML::SaxParser.new(handler, html).parse + def self.sax_parse_html(handler, html, options = {}) + HTML::SaxParser.new(handler, html, options).parse end end # Oga diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb index 7354f5c..32467e1 100644 --- a/lib/oga/xml/lexer.rb +++ b/lib/oga/xml/lexer.rb @@ -34,6 +34,16 @@ module Oga # However, it is perfectly safe to use different instances per thread. # There is no _global_ state used by this lexer. # + # ## Strict Mode + # + # By default the lexer is rather permissive regarding the input. For + # example, missing closing tags are inserted by default. To disable this + # behaviour the lexer can be run in "strict mode" by setting `:strict` to + # `true`: + # + # lexer = Oga::XML::Lexer.new('...', :strict => true) + # + # Strict mode only applies to XML documents. # # @private # @@ -97,13 +107,17 @@ module Oga # # @param [Hash] options # - # @option options [Symbol] :html When set to `true` the lexer will treat - # the input as HTML instead of SGML/XML. This makes it possible to lex - # HTML void elements such as ``. + # @option options [TrueClass|FalseClass] :html When set to `true` the + # lexer will treat the input as HTML instead of XML. This makes it + # possible to lex HTML void elements such as ``. + # + # @option options [TrueClass|FalseClass] :strict Enables/disables strict + # parsing of XML documents, disabled by default. # def initialize(data, options = {}) - @data = data - @html = options[:html] + @data = data + @html = options[:html] + @strict = options[:strict] || false reset end @@ -191,7 +205,7 @@ module Oga end # Add any missing closing tags - unless @elements.empty? + if !strict? and !@elements.empty? @elements.length.times { on_element_end } end ensure @@ -205,6 +219,13 @@ module Oga return @html == true end + ## + # @return [TrueClass|FalseClass] + # + def strict? 
+ return @strict + end + ## # @return [TrueClass|FalseClass] # diff --git a/spec/oga/oga_spec.rb b/spec/oga/oga_spec.rb index 4e140fa..067904e 100644 --- a/spec/oga/oga_spec.rb +++ b/spec/oga/oga_spec.rb @@ -1,16 +1,26 @@ require 'spec_helper' describe Oga do - it 'parses an XML document' do - document = described_class.parse_xml('foo') + describe 'parsing XML' do + it 'parses an XML document' do + document = described_class.parse_xml('foo') - document.is_a?(Oga::XML::Document).should == true + document.is_a?(Oga::XML::Document).should == true + end + + it 'raises an error when parsing an invalid document in strict mode' do + block = proc { described_class.parse_xml('foo', :strict => true) } + + block.should raise_error(LL::ParserError) + end end - it 'parses an HTML document' do - document = described_class.parse_xml('') + describe 'parsing HTML' do + it 'parses an HTML document' do + document = described_class.parse_xml('') - document.is_a?(Oga::XML::Document).should == true + document.is_a?(Oga::XML::Document).should == true + end end describe 'SAX parsing' do @@ -27,13 +37,19 @@ describe Oga do end it 'parses an XML document using the SAX parser' do - Oga.sax_parse_xml(@handler, '') + described_class.sax_parse_xml(@handler, '') @handler.name.should == 'foo' end + it 'raises an error when parsing an invalid XML document in strict mode' do + block = proc { Oga.sax_parse_xml(@handler, '', :strict => true) } + + block.should raise_error(LL::ParserError) + end + it 'parses an HTML document using the SAX parser' do - Oga.sax_parse_html(@handler, '') + described_class.sax_parse_html(@handler, '') @handler.name.should == 'link' end diff --git a/spec/oga/xml/lexer/strict_spec.rb b/spec/oga/xml/lexer/strict_spec.rb new file mode 100644 index 0000000..7b840fc --- /dev/null +++ b/spec/oga/xml/lexer/strict_spec.rb @@ -0,0 +1,12 @@ +require 'spec_helper' + +describe Oga::XML::Lexer do + describe 'lexing XML using strict mode' do + it 'does not automatically insert missing 
closing tags' do + lex('bar', :strict => true).should == [ + [:T_ELEM_NAME, 'foo', 1], + [:T_TEXT, 'bar', 1] + ] + end + end +end