Support for strict parsing of XML documents

Currently this only disables the automatic insertion of closing tags; in
the future this may also disable other features if deemed worth the
effort.

Fixes #107
This commit is contained in:
Yorick Peterse 2015-06-15 23:53:11 +02:00
parent 4031c4f843
commit 2c18a51ba9
5 changed files with 82 additions and 28 deletions

View File

@ -28,6 +28,11 @@ Parsing a simple string of XML:
Oga.parse_xml('<people><person>Alice</person></people>')
Parsing XML using strict mode (disables automatic tag insertion):
Oga.parse_xml('<people>foo</people>', :strict => true) # works fine
Oga.parse_xml('<people>foo', :strict => true) # throws an error
Parsing a simple string of HTML:
Oga.parse_html('<link rel="stylesheet" href="foo.css">')

View File

@ -5,11 +5,12 @@ module Oga
# @example
# document = Oga.parse_xml('<root>Hello</root>')
#
# @param [String|IO] xml The XML input to parse.
# @see [Oga::XML::Lexer#initialize]
#
# @return [Oga::XML::Document]
#
def self.parse_xml(xml)
return XML::Parser.new(xml).parse
def self.parse_xml(xml, options = {})
return XML::Parser.new(xml, options).parse
end
##
@ -18,11 +19,12 @@ module Oga
# @example
# document = Oga.parse_html('<html>...</html>')
#
# @param [String|IO] html The HTML input to parse.
# @see [Oga::XML::Lexer#initialize]
#
# @return [Oga::XML::Document]
#
def self.parse_html(html)
return HTML::Parser.new(html).parse
def self.parse_html(html, options = {})
return HTML::Parser.new(html, options).parse
end
##
@ -33,11 +35,10 @@ module Oga
#
# Oga.sax_parse_xml(handler, '<root>Hello</root>')
#
# @param [Object] handler The SAX handler for the parser.
# @param [String|IO] xml The XML to parse.
# @see [Oga::XML::SaxParser#initialize]
#
def self.sax_parse_xml(handler, xml)
XML::SaxParser.new(handler, xml).parse
def self.sax_parse_xml(handler, xml, options = {})
XML::SaxParser.new(handler, xml, options).parse
end
##
@ -48,10 +49,9 @@ module Oga
#
# Oga.sax_parse_html(handler, '<script>foo()</script>')
#
# @param [Object] handler The SAX handler for the parser.
# @param [String|IO] html The HTML to parse.
# @see [Oga::XML::SaxParser#initialize]
#
def self.sax_parse_html(handler, html)
HTML::SaxParser.new(handler, html).parse
def self.sax_parse_html(handler, html, options = {})
HTML::SaxParser.new(handler, html, options).parse
end
end # Oga

View File

@ -34,6 +34,16 @@ module Oga
# However, it is perfectly safe to use different instances per thread.
# There is no _global_ state used by this lexer.
#
# ## Strict Mode
#
# By default the lexer is rather permissive regarding the input. For
# example, missing closing tags are inserted by default. To disable this
# behaviour the lexer can be run in "strict mode" by setting `:strict` to
# `true`:
#
# lexer = Oga::XML::Lexer.new('...', :strict => true)
#
# Strict mode only applies to XML documents.
#
# @private
#
@ -97,13 +107,17 @@ module Oga
#
# @param [Hash] options
#
# @option options [Symbol] :html When set to `true` the lexer will treat
# the input as HTML instead of SGML/XML. This makes it possible to lex
# HTML void elements such as `<link href="">`.
# @option options [TrueClass|FalseClass] :html When set to `true` the
# lexer will treat the input as HTML instead of XML. This makes it
# possible to lex HTML void elements such as `<link href="">`.
#
# @option options [TrueClass|FalseClass] :strict Enables/disables strict
# parsing of XML documents, disabled by default.
#
def initialize(data, options = {})
  # Mode flags come straight from the options hash. A missing (or falsy)
  # :strict entry falls back to `false`; :html is stored as given.
  @html   = options[:html]
  @strict = options[:strict] || false

  # The raw input handed to the lexer.
  @data = data

  # `reset` is defined outside this hunk; presumably it prepares the
  # remaining lexer state -- confirm against its definition.
  reset
end
@ -191,7 +205,7 @@ module Oga
end
# Add any missing closing tags
unless @elements.empty?
if !strict? and !@elements.empty?
@elements.length.times { on_element_end }
end
ensure
@ -205,6 +219,13 @@ module Oga
return @html == true
end
##
# Reports whether the lexer is running in strict mode.
#
# The stored flag is coerced to a real boolean so the documented return
# type holds even when `@strict` contains a truthy non-boolean value
# (for example `1` or a String). Truthiness itself is unchanged: any
# truthy value still means "strict".
#
# @return [TrueClass|FalseClass]
#
def strict?
  return !!@strict
end
##
# @return [TrueClass|FalseClass]
#

View File

@ -1,17 +1,27 @@
require 'spec_helper'
describe Oga do
describe 'parsing XML' do
  # Well-formed input should produce a document object.
  it 'parses an XML document' do
    parsed = described_class.parse_xml('<root>foo</root>')

    parsed.is_a?(Oga::XML::Document).should == true
  end

  # The input lacks the closing </root> tag; with :strict enabled the
  # parser is expected to raise instead of completing the document.
  it 'raises an error when parsing an invalid document in strict mode' do
    strict_parse = proc do
      described_class.parse_xml('<root>foo', :strict => true)
    end

    strict_parse.should raise_error(LL::ParserError)
  end
end
describe 'parsing HTML' do
  # Exercises Oga.parse_html. This example previously called parse_xml,
  # which left the HTML entry point untested.
  it 'parses an HTML document' do
    document = described_class.parse_html('<html><body></body></html>')

    document.is_a?(Oga::XML::Document).should == true
  end
end
describe 'SAX parsing' do
before do
@ -27,13 +37,19 @@ describe Oga do
end
it 'parses an XML document using the SAX parser' do
Oga.sax_parse_xml(@handler, '<foo />')
described_class.sax_parse_xml(@handler, '<foo />')
@handler.name.should == 'foo'
end
# The <foo> element is never closed; in strict mode the SAX parser is
# expected to raise rather than finish the document. Uses
# `described_class` like the neighbouring SAX examples.
it 'raises an error when parsing an invalid XML document in strict mode' do
  block = proc do
    described_class.sax_parse_xml(@handler, '<foo>', :strict => true)
  end

  block.should raise_error(LL::ParserError)
end
it 'parses an HTML document using the SAX parser' do
Oga.sax_parse_html(@handler, '<link>')
described_class.sax_parse_html(@handler, '<link>')
@handler.name.should == 'link'
end

View File

@ -0,0 +1,12 @@
require 'spec_helper'
describe Oga::XML::Lexer do
  describe 'lexing XML using strict mode' do
    # With :strict enabled the token stream must end after the text node:
    # no closing token is emitted for the unclosed <foo> element.
    it 'does not automatically insert missing closing tags' do
      tokens = lex('<foo>bar', :strict => true)

      tokens.should == [
        [:T_ELEM_NAME, 'foo', 1],
        [:T_TEXT, 'bar', 1]
      ]
    end
  end
end