diff --git a/README.md b/README.md
index f1f500a..7d2d8c8 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,11 @@ Parsing a simple string of XML:
Oga.parse_xml('Alice')
+Parsing XML using strict mode (disables automatic tag insertion):
+
+ Oga.parse_xml('<people>foo</people>', :strict => true) # works fine
+ Oga.parse_xml('<people>foo', :strict => true) # throws an error
+
Parsing a simple string of HTML:
Oga.parse_html('')
diff --git a/lib/oga/oga.rb b/lib/oga/oga.rb
index e5e3824..b7feda2 100644
--- a/lib/oga/oga.rb
+++ b/lib/oga/oga.rb
@@ -5,11 +5,12 @@ module Oga
# @example
# document = Oga.parse_xml('Hello')
#
- # @param [String|IO] xml The XML input to parse.
+ # @see Oga::XML::Lexer#initialize
+ #
# @return [Oga::XML::Document]
#
- def self.parse_xml(xml)
- return XML::Parser.new(xml).parse
+ def self.parse_xml(xml, options = {})
+ return XML::Parser.new(xml, options).parse
end
##
@@ -18,11 +19,12 @@ module Oga
# @example
# document = Oga.parse_html('...')
#
- # @param [String|IO] html The HTML input to parse.
+ # @see Oga::XML::Lexer#initialize
+ #
# @return [Oga::XML::Document]
#
- def self.parse_html(html)
- return HTML::Parser.new(html).parse
+ def self.parse_html(html, options = {})
+ return HTML::Parser.new(html, options).parse
end
##
@@ -33,11 +35,10 @@ module Oga
#
# Oga.sax_parse_html(handler, 'Hello')
#
- # @param [Object] handler The SAX handler for the parser.
- # @param [String|IO] xml The XML to parse.
+ # @see Oga::XML::SaxParser#initialize
#
- def self.sax_parse_xml(handler, xml)
- XML::SaxParser.new(handler, xml).parse
+ def self.sax_parse_xml(handler, xml, options = {})
+ XML::SaxParser.new(handler, xml, options).parse
end
##
@@ -48,10 +49,9 @@ module Oga
#
# Oga.sax_parse_html(handler, '')
#
- # @param [Object] handler The SAX handler for the parser.
- # @param [String|IO] html The HTML to parse.
+ # @see Oga::XML::SaxParser#initialize
#
- def self.sax_parse_html(handler, html)
- HTML::SaxParser.new(handler, html).parse
+ def self.sax_parse_html(handler, html, options = {})
+ HTML::SaxParser.new(handler, html, options).parse
end
end # Oga
diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb
index 7354f5c..32467e1 100644
--- a/lib/oga/xml/lexer.rb
+++ b/lib/oga/xml/lexer.rb
@@ -34,6 +34,16 @@ module Oga
# However, it is perfectly safe to use different instances per thread.
# There is no _global_ state used by this lexer.
#
+ # ## Strict Mode
+ #
+ # By default the lexer is rather permissive regarding the input. For
+ # example, missing closing tags are inserted by default. To disable this
+ # behaviour the lexer can be run in "strict mode" by setting `:strict` to
+ # `true`:
+ #
+ # lexer = Oga::XML::Lexer.new('...', :strict => true)
+ #
+ # Strict mode only applies to XML documents.
#
# @private
#
@@ -97,13 +107,17 @@ module Oga
#
# @param [Hash] options
#
- # @option options [Symbol] :html When set to `true` the lexer will treat
- # the input as HTML instead of SGML/XML. This makes it possible to lex
- # HTML void elements such as ``.
+ # @option options [TrueClass|FalseClass] :html When set to `true` the
+ # lexer will treat the input as HTML instead of XML. This makes it
+ # possible to lex HTML void elements such as `<link href="">`.
+ #
+ # @option options [TrueClass|FalseClass] :strict Enables/disables strict
+ # parsing of XML documents, disabled by default.
#
def initialize(data, options = {})
- @data = data
- @html = options[:html]
+ @data = data
+ @html = options[:html]
+ @strict = options[:strict] || false
reset
end
@@ -191,7 +205,7 @@ module Oga
end
# Add any missing closing tags
- unless @elements.empty?
+ if !strict? and !@elements.empty?
@elements.length.times { on_element_end }
end
ensure
@@ -205,6 +219,13 @@ module Oga
return @html == true
end
+ ##
+ # @return [TrueClass|FalseClass]
+ #
+ def strict?
+ return @strict
+ end
+
##
# @return [TrueClass|FalseClass]
#
diff --git a/spec/oga/oga_spec.rb b/spec/oga/oga_spec.rb
index 4e140fa..067904e 100644
--- a/spec/oga/oga_spec.rb
+++ b/spec/oga/oga_spec.rb
@@ -1,16 +1,26 @@
require 'spec_helper'
describe Oga do
- it 'parses an XML document' do
- document = described_class.parse_xml('foo')
+ describe 'parsing XML' do
+ it 'parses an XML document' do
+ document = described_class.parse_xml('foo')
- document.is_a?(Oga::XML::Document).should == true
+ document.is_a?(Oga::XML::Document).should == true
+ end
+
+ it 'raises an error when parsing an invalid document in strict mode' do
+ block = proc { described_class.parse_xml('foo', :strict => true) }
+
+ block.should raise_error(LL::ParserError)
+ end
end
- it 'parses an HTML document' do
- document = described_class.parse_xml('')
+ describe 'parsing HTML' do
+ it 'parses an HTML document' do
+ document = described_class.parse_xml('')
- document.is_a?(Oga::XML::Document).should == true
+ document.is_a?(Oga::XML::Document).should == true
+ end
end
describe 'SAX parsing' do
@@ -27,13 +37,19 @@ describe Oga do
end
it 'parses an XML document using the SAX parser' do
- Oga.sax_parse_xml(@handler, '')
+ described_class.sax_parse_xml(@handler, '')
@handler.name.should == 'foo'
end
+ it 'raises an error when parsing an invalid XML document in strict mode' do
+ block = proc { Oga.sax_parse_xml(@handler, '', :strict => true) }
+
+ block.should raise_error(LL::ParserError)
+ end
+
it 'parses an HTML document using the SAX parser' do
- Oga.sax_parse_html(@handler, '')
+ described_class.sax_parse_html(@handler, '')
@handler.name.should == 'link'
end
diff --git a/spec/oga/xml/lexer/strict_spec.rb b/spec/oga/xml/lexer/strict_spec.rb
new file mode 100644
index 0000000..7b840fc
--- /dev/null
+++ b/spec/oga/xml/lexer/strict_spec.rb
@@ -0,0 +1,12 @@
+require 'spec_helper'
+
+describe Oga::XML::Lexer do
+ describe 'lexing XML using strict mode' do
+ it 'does not automatically insert missing closing tags' do
+ lex('bar', :strict => true).should == [
+ [:T_ELEM_NAME, 'foo', 1],
+ [:T_TEXT, 'bar', 1]
+ ]
+ end
+ end
+end