From 2c18a51ba905d46469170af7f071b068abe965eb Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Mon, 15 Jun 2015 23:53:11 +0200 Subject: [PATCH] Support for strict parsing of XML documents Currently this only disables the automatic insertion of closing tags; in the future this may also disable other features if deemed worth the effort. Fixes #107 --- README.md | 5 +++++ lib/oga/oga.rb | 28 +++++++++++++------------- lib/oga/xml/lexer.rb | 33 +++++++++++++++++++++++++------ spec/oga/oga_spec.rb | 32 ++++++++++++++++++++++-------- spec/oga/xml/lexer/strict_spec.rb | 12 +++++++++++ 5 files changed, 82 insertions(+), 28 deletions(-) create mode 100644 spec/oga/xml/lexer/strict_spec.rb diff --git a/README.md b/README.md index f1f500a..7d2d8c8 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,11 @@ Parsing a simple string of XML: Oga.parse_xml('Alice') +Parsing XML using strict mode (disables automatic tag insertion): + + Oga.parse_xml('foo', :strict => true) # works fine + Oga.parse_xml('foo', :strict => true) # throws an error + Parsing a simple string of HTML: Oga.parse_html('') diff --git a/lib/oga/oga.rb b/lib/oga/oga.rb index e5e3824..b7feda2 100644 --- a/lib/oga/oga.rb +++ b/lib/oga/oga.rb @@ -5,11 +5,12 @@ module Oga # @example # document = Oga.parse_xml('Hello') # - # @param [String|IO] xml The XML input to parse. + # @see [Oga::XML::Lexer#initialize] + # # @return [Oga::XML::Document] # - def self.parse_xml(xml) - return XML::Parser.new(xml).parse + def self.parse_xml(xml, options = {}) + return XML::Parser.new(xml, options).parse end ## @@ -18,11 +19,12 @@ module Oga # @example # document = Oga.parse_html('...') # - # @param [String|IO] html The HTML input to parse. 
+ # @see [Oga::XML::Lexer#initialize] + # # @return [Oga::XML::Document] # - def self.parse_html(html) - return HTML::Parser.new(html).parse + def self.parse_html(html, options = {}) + return HTML::Parser.new(html, options).parse end ## @@ -33,11 +35,10 @@ module Oga # # Oga.sax_parse_html(handler, 'Hello') # - # @param [Object] handler The SAX handler for the parser. - # @param [String|IO] xml The XML to parse. + # @see [Oga::XML::SaxParser#initialize] # - def self.sax_parse_xml(handler, xml) - XML::SaxParser.new(handler, xml).parse + def self.sax_parse_xml(handler, xml, options = {}) + XML::SaxParser.new(handler, xml, options).parse end ## @@ -48,10 +49,9 @@ module Oga # # Oga.sax_parse_html(handler, '') # - # @param [Object] handler The SAX handler for the parser. - # @param [String|IO] html The HTML to parse. + # @see [Oga::XML::SaxParser#initialize] # - def self.sax_parse_html(handler, html) - HTML::SaxParser.new(handler, html).parse + def self.sax_parse_html(handler, html, options = {}) + HTML::SaxParser.new(handler, html, options).parse end end # Oga diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb index 7354f5c..32467e1 100644 --- a/lib/oga/xml/lexer.rb +++ b/lib/oga/xml/lexer.rb @@ -34,6 +34,16 @@ module Oga # However, it is perfectly save to use different instances per thread. # There is no _global_ state used by this lexer. # + # ## Strict Mode + # + # By default the lexer is rather permissive regarding the input. For + # example, missing closing tags are inserted by default. To disable this + # behaviour the lexer can be run in "strict mode" by setting `:strict` to + # `true`: + # + # lexer = Oga::XML::Lexer.new('...', :strict => true) + # + # Strict mode only applies to XML documents. # # @private # @@ -97,13 +107,17 @@ module Oga # # @param [Hash] options # - # @option options [Symbol] :html When set to `true` the lexer will treat - # the input as HTML instead of SGML/XML. This makes it possible to lex - # HTML void elements such as ``. 
+ # @option options [TrueClass|FalseClass] :html When set to `true` the + # lexer will treat the input as HTML instead of XML. This makes it + # possible to lex HTML void elements such as ``. + # + # @option options [TrueClass|FalseClass] :strict Enables/disables strict + # parsing of XML documents, disabled by default. # def initialize(data, options = {}) - @data = data - @html = options[:html] + @data = data + @html = options[:html] + @strict = options[:strict] || false reset end @@ -191,7 +205,7 @@ module Oga end # Add any missing closing tags - unless @elements.empty? + if !strict? and !@elements.empty? @elements.length.times { on_element_end } end ensure @@ -205,6 +219,13 @@ module Oga return @html == true end + ## + # @return [TrueClass|FalseClass] + # + def strict? + return @strict + end + ## # @return [TrueClass|FalseClass] # diff --git a/spec/oga/oga_spec.rb b/spec/oga/oga_spec.rb index 4e140fa..067904e 100644 --- a/spec/oga/oga_spec.rb +++ b/spec/oga/oga_spec.rb @@ -1,16 +1,26 @@ require 'spec_helper' describe Oga do - it 'parses an XML document' do - document = described_class.parse_xml('foo') + describe 'parsing XML' do + it 'parses an XML document' do + document = described_class.parse_xml('foo') - document.is_a?(Oga::XML::Document).should == true + document.is_a?(Oga::XML::Document).should == true + end + + it 'raises an error when parsing an invalid document in strict mode' do + block = proc { described_class.parse_xml('foo', :strict => true) } + + block.should raise_error(LL::ParserError) + end end - it 'parses an HTML document' do - document = described_class.parse_xml('') + describe 'parsing HTML' do + it 'parses an HTML document' do + document = described_class.parse_xml('') - document.is_a?(Oga::XML::Document).should == true + document.is_a?(Oga::XML::Document).should == true + end end describe 'SAX parsing' do @@ -27,13 +37,19 @@ describe Oga do end it 'parses an XML document using the SAX parser' do - Oga.sax_parse_xml(@handler, '') + 
described_class.sax_parse_xml(@handler, '') @handler.name.should == 'foo' end + it 'raises an error when parsing an invalid XML document in strict mode' do + block = proc { Oga.sax_parse_xml(@handler, '', :strict => true) } + + block.should raise_error(LL::ParserError) + end + it 'parses an HTML document using the SAX parser' do - Oga.sax_parse_html(@handler, '') + described_class.sax_parse_html(@handler, '') @handler.name.should == 'link' end diff --git a/spec/oga/xml/lexer/strict_spec.rb b/spec/oga/xml/lexer/strict_spec.rb new file mode 100644 index 0000000..7b840fc --- /dev/null +++ b/spec/oga/xml/lexer/strict_spec.rb @@ -0,0 +1,12 @@ +require 'spec_helper' + +describe Oga::XML::Lexer do + describe 'lexing XML using strict mode' do + it 'does not automatically insert missing closing tags' do + lex('bar', :strict => true).should == [ + [:T_ELEM_NAME, 'foo', 1], + [:T_TEXT, 'bar', 1] + ] + end + end +end