Support for strict parsing of XML documents

Currently this only disables the automatic insertion of closing tags. In
the future this may also disable other features if deemed worth the
effort.

Fixes #107
This commit is contained in:
Yorick Peterse 2015-06-15 23:53:11 +02:00
parent 4031c4f843
commit 2c18a51ba9
5 changed files with 82 additions and 28 deletions

View File

@ -28,6 +28,11 @@ Parsing a simple string of XML:
Oga.parse_xml('<people><person>Alice</person></people>') Oga.parse_xml('<people><person>Alice</person></people>')
Parsing XML using strict mode (disables automatic tag insertion):
Oga.parse_xml('<people>foo</people>', :strict => true) # works fine
Oga.parse_xml('<people>foo', :strict => true) # throws an error
Parsing a simple string of HTML: Parsing a simple string of HTML:
Oga.parse_html('<link rel="stylesheet" href="foo.css">') Oga.parse_html('<link rel="stylesheet" href="foo.css">')

View File

@ -5,11 +5,12 @@ module Oga
# @example # @example
# document = Oga.parse_xml('<root>Hello</root>') # document = Oga.parse_xml('<root>Hello</root>')
# #
# @param [String|IO] xml The XML input to parse. # @see [Oga::XML::Lexer#initialize]
#
# @return [Oga::XML::Document] # @return [Oga::XML::Document]
# #
def self.parse_xml(xml) def self.parse_xml(xml, options = {})
return XML::Parser.new(xml).parse return XML::Parser.new(xml, options).parse
end end
## ##
@ -18,11 +19,12 @@ module Oga
# @example # @example
# document = Oga.parse_html('<html>...</html>') # document = Oga.parse_html('<html>...</html>')
# #
# @param [String|IO] html The HTML input to parse. # @see [Oga::XML::Lexer#initialize]
#
# @return [Oga::XML::Document] # @return [Oga::XML::Document]
# #
def self.parse_html(html) def self.parse_html(html, options = {})
return HTML::Parser.new(html).parse return HTML::Parser.new(html, options).parse
end end
## ##
@ -33,11 +35,10 @@ module Oga
# #
# Oga.sax_parse_xml(handler, '<root>Hello</root>') # Oga.sax_parse_xml(handler, '<root>Hello</root>')
# #
# @param [Object] handler The SAX handler for the parser. # @see [Oga::XML::SaxParser#initialize]
# @param [String|IO] xml The XML to parse.
# #
def self.sax_parse_xml(handler, xml) def self.sax_parse_xml(handler, xml, options = {})
XML::SaxParser.new(handler, xml).parse XML::SaxParser.new(handler, xml, options).parse
end end
## ##
@ -48,10 +49,9 @@ module Oga
# #
# Oga.sax_parse_html(handler, '<script>foo()</script>') # Oga.sax_parse_html(handler, '<script>foo()</script>')
# #
# @param [Object] handler The SAX handler for the parser. # @see [Oga::XML::SaxParser#initialize]
# @param [String|IO] html The HTML to parse.
# #
def self.sax_parse_html(handler, html) def self.sax_parse_html(handler, html, options = {})
HTML::SaxParser.new(handler, html).parse HTML::SaxParser.new(handler, html, options).parse
end end
end # Oga end # Oga

View File

@ -34,6 +34,16 @@ module Oga
# However, it is perfectly safe to use different instances per thread. # However, it is perfectly safe to use different instances per thread.
# There is no _global_ state used by this lexer. # There is no _global_ state used by this lexer.
# #
# ## Strict Mode
#
# By default the lexer is rather permissive regarding the input. For
# example, missing closing tags are inserted by default. To disable this
# behaviour the lexer can be run in "strict mode" by setting `:strict` to
# `true`:
#
# lexer = Oga::XML::Lexer.new('...', :strict => true)
#
# Strict mode only applies to XML documents.
# #
# @private # @private
# #
@ -97,13 +107,17 @@ module Oga
# #
# @param [Hash] options # @param [Hash] options
# #
# @option options [Symbol] :html When set to `true` the lexer will treat # @option options [TrueClass|FalseClass] :html When set to `true` the
# the input as HTML instead of SGML/XML. This makes it possible to lex # lexer will treat the input as HTML instead of XML. This makes it
# HTML void elements such as `<link href="">`. # possible to lex HTML void elements such as `<link href="">`.
#
# @option options [TrueClass|FalseClass] :strict Enables/disables strict
# parsing of XML documents, disabled by default.
# #
def initialize(data, options = {}) def initialize(data, options = {})
@data = data @data = data
@html = options[:html] @html = options[:html]
@strict = options[:strict] || false
reset reset
end end
@ -191,7 +205,7 @@ module Oga
end end
# Add any missing closing tags # Add any missing closing tags
unless @elements.empty? if !strict? and !@elements.empty?
@elements.length.times { on_element_end } @elements.length.times { on_element_end }
end end
ensure ensure
@ -205,6 +219,13 @@ module Oga
return @html == true return @html == true
end end
##
# @return [TrueClass|FalseClass]
#
def strict?
return @strict
end
## ##
# @return [TrueClass|FalseClass] # @return [TrueClass|FalseClass]
# #

View File

@ -1,17 +1,27 @@
require 'spec_helper' require 'spec_helper'
describe Oga do describe Oga do
describe 'parsing XML' do
it 'parses an XML document' do it 'parses an XML document' do
document = described_class.parse_xml('<root>foo</root>') document = described_class.parse_xml('<root>foo</root>')
document.is_a?(Oga::XML::Document).should == true document.is_a?(Oga::XML::Document).should == true
end end
it 'raises an error when parsing an invalid document in strict mode' do
block = proc { described_class.parse_xml('<root>foo', :strict => true) }
block.should raise_error(LL::ParserError)
end
end
describe 'parsing HTML' do
it 'parses an HTML document' do it 'parses an HTML document' do
document = described_class.parse_xml('<html><body></body></html>') document = described_class.parse_xml('<html><body></body></html>')
document.is_a?(Oga::XML::Document).should == true document.is_a?(Oga::XML::Document).should == true
end end
end
describe 'SAX parsing' do describe 'SAX parsing' do
before do before do
@ -27,13 +37,19 @@ describe Oga do
end end
it 'parses an XML document using the SAX parser' do it 'parses an XML document using the SAX parser' do
Oga.sax_parse_xml(@handler, '<foo />') described_class.sax_parse_xml(@handler, '<foo />')
@handler.name.should == 'foo' @handler.name.should == 'foo'
end end
it 'raises an error when parsing an invalid XML document in strict mode' do
block = proc { Oga.sax_parse_xml(@handler, '<foo>', :strict => true) }
block.should raise_error(LL::ParserError)
end
it 'parses an HTML document using the SAX parser' do it 'parses an HTML document using the SAX parser' do
Oga.sax_parse_html(@handler, '<link>') described_class.sax_parse_html(@handler, '<link>')
@handler.name.should == 'link' @handler.name.should == 'link'
end end

View File

@ -0,0 +1,12 @@
require 'spec_helper'
describe Oga::XML::Lexer do
describe 'lexing XML using strict mode' do
it 'does not automatically insert missing closing tags' do
lex('<foo>bar', :strict => true).should == [
[:T_ELEM_NAME, 'foo', 1],
[:T_TEXT, 'bar', 1]
]
end
end
end