Support for strict parsing of XML documents
Currently this only disables the automatic insertion of closing tags; in the future it may also disable other features if deemed worth the effort. Fixes #107
This commit is contained in:
parent
4031c4f843
commit
2c18a51ba9
|
@ -28,6 +28,11 @@ Parsing a simple string of XML:
|
||||||
|
|
||||||
Oga.parse_xml('<people><person>Alice</person></people>')
|
Oga.parse_xml('<people><person>Alice</person></people>')
|
||||||
|
|
||||||
|
Parsing XML using strict mode (disables automatic tag insertion):
|
||||||
|
|
||||||
|
Oga.parse_xml('<people>foo</people>', :strict => true) # works fine
|
||||||
|
Oga.parse_xml('<people>foo', :strict => true) # throws an error
|
||||||
|
|
||||||
Parsing a simple string of HTML:
|
Parsing a simple string of HTML:
|
||||||
|
|
||||||
Oga.parse_html('<link rel="stylesheet" href="foo.css">')
|
Oga.parse_html('<link rel="stylesheet" href="foo.css">')
|
||||||
|
|
|
@ -5,11 +5,12 @@ module Oga
|
||||||
# @example
|
# @example
|
||||||
# document = Oga.parse_xml('<root>Hello</root>')
|
# document = Oga.parse_xml('<root>Hello</root>')
|
||||||
#
|
#
|
||||||
# @param [String|IO] xml The XML input to parse.
|
# @see Oga::XML::Lexer#initialize
|
||||||
|
#
|
||||||
# @return [Oga::XML::Document]
|
# @return [Oga::XML::Document]
|
||||||
#
|
#
|
||||||
def self.parse_xml(xml)
|
def self.parse_xml(xml, options = {})
|
||||||
return XML::Parser.new(xml).parse
|
return XML::Parser.new(xml, options).parse
|
||||||
end
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
|
@ -18,11 +19,12 @@ module Oga
|
||||||
# @example
|
# @example
|
||||||
# document = Oga.parse_html('<html>...</html>')
|
# document = Oga.parse_html('<html>...</html>')
|
||||||
#
|
#
|
||||||
# @param [String|IO] html The HTML input to parse.
|
# @see Oga::XML::Lexer#initialize
|
||||||
|
#
|
||||||
# @return [Oga::XML::Document]
|
# @return [Oga::XML::Document]
|
||||||
#
|
#
|
||||||
def self.parse_html(html)
|
def self.parse_html(html, options = {})
|
||||||
return HTML::Parser.new(html).parse
|
return HTML::Parser.new(html, options).parse
|
||||||
end
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
|
@ -33,11 +35,10 @@ module Oga
|
||||||
#
|
#
|
||||||
# Oga.sax_parse_xml(handler, '<root>Hello</root>')
|
# Oga.sax_parse_xml(handler, '<root>Hello</root>')
|
||||||
#
|
#
|
||||||
# @param [Object] handler The SAX handler for the parser.
|
# @see Oga::XML::SaxParser#initialize
|
||||||
# @param [String|IO] xml The XML to parse.
|
|
||||||
#
|
#
|
||||||
def self.sax_parse_xml(handler, xml)
|
def self.sax_parse_xml(handler, xml, options = {})
|
||||||
XML::SaxParser.new(handler, xml).parse
|
XML::SaxParser.new(handler, xml, options).parse
|
||||||
end
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
|
@ -48,10 +49,9 @@ module Oga
|
||||||
#
|
#
|
||||||
# Oga.sax_parse_html(handler, '<script>foo()</script>')
|
# Oga.sax_parse_html(handler, '<script>foo()</script>')
|
||||||
#
|
#
|
||||||
# @param [Object] handler The SAX handler for the parser.
|
# @see Oga::XML::SaxParser#initialize
|
||||||
# @param [String|IO] html The HTML to parse.
|
|
||||||
#
|
#
|
||||||
def self.sax_parse_html(handler, html)
|
def self.sax_parse_html(handler, html, options = {})
|
||||||
HTML::SaxParser.new(handler, html).parse
|
HTML::SaxParser.new(handler, html, options).parse
|
||||||
end
|
end
|
||||||
end # Oga
|
end # Oga
|
||||||
|
|
|
@ -34,6 +34,16 @@ module Oga
|
||||||
# However, it is perfectly safe to use different instances per thread.
|
# However, it is perfectly safe to use different instances per thread.
|
||||||
# There is no _global_ state used by this lexer.
|
# There is no _global_ state used by this lexer.
|
||||||
#
|
#
|
||||||
|
# ## Strict Mode
|
||||||
|
#
|
||||||
|
# By default the lexer is rather permissive regarding the input. For
|
||||||
|
# example, missing closing tags are inserted by default. To disable this
|
||||||
|
# behaviour the lexer can be run in "strict mode" by setting `:strict` to
|
||||||
|
# `true`:
|
||||||
|
#
|
||||||
|
# lexer = Oga::XML::Lexer.new('...', :strict => true)
|
||||||
|
#
|
||||||
|
# Strict mode only applies to XML documents.
|
||||||
#
|
#
|
||||||
# @private
|
# @private
|
||||||
#
|
#
|
||||||
|
@ -97,13 +107,17 @@ module Oga
|
||||||
#
|
#
|
||||||
# @param [Hash] options
|
# @param [Hash] options
|
||||||
#
|
#
|
||||||
# @option options [Symbol] :html When set to `true` the lexer will treat
|
# @option options [TrueClass|FalseClass] :html When set to `true` the
|
||||||
# the input as HTML instead of SGML/XML. This makes it possible to lex
|
# lexer will treat the input as HTML instead of XML. This makes it
|
||||||
# HTML void elements such as `<link href="">`.
|
# possible to lex HTML void elements such as `<link href="">`.
|
||||||
|
#
|
||||||
|
# @option options [TrueClass|FalseClass] :strict Enables/disables strict
|
||||||
|
# parsing of XML documents, disabled by default.
|
||||||
#
|
#
|
||||||
def initialize(data, options = {})
|
def initialize(data, options = {})
|
||||||
@data = data
|
@data = data
|
||||||
@html = options[:html]
|
@html = options[:html]
|
||||||
|
@strict = options[:strict] || false
|
||||||
|
|
||||||
reset
|
reset
|
||||||
end
|
end
|
||||||
|
@ -191,7 +205,7 @@ module Oga
|
||||||
end
|
end
|
||||||
|
|
||||||
# Add any missing closing tags
|
# Add any missing closing tags
|
||||||
unless @elements.empty?
|
if !strict? and !@elements.empty?
|
||||||
@elements.length.times { on_element_end }
|
@elements.length.times { on_element_end }
|
||||||
end
|
end
|
||||||
ensure
|
ensure
|
||||||
|
@ -205,6 +219,13 @@ module Oga
|
||||||
return @html == true
|
return @html == true
|
||||||
end
|
end
|
||||||
|
|
||||||
|
##
|
||||||
|
# @return [TrueClass|FalseClass]
|
||||||
|
#
|
||||||
|
def strict?
|
||||||
|
return @strict
|
||||||
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
# @return [TrueClass|FalseClass]
|
# @return [TrueClass|FalseClass]
|
||||||
#
|
#
|
||||||
|
|
|
@ -1,16 +1,26 @@
|
||||||
require 'spec_helper'
|
require 'spec_helper'
|
||||||
|
|
||||||
describe Oga do
|
describe Oga do
|
||||||
it 'parses an XML document' do
|
describe 'parsing XML' do
|
||||||
document = described_class.parse_xml('<root>foo</root>')
|
it 'parses an XML document' do
|
||||||
|
document = described_class.parse_xml('<root>foo</root>')
|
||||||
|
|
||||||
document.is_a?(Oga::XML::Document).should == true
|
document.is_a?(Oga::XML::Document).should == true
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'raises an error when parsing an invalid document in strict mode' do
|
||||||
|
block = proc { described_class.parse_xml('<root>foo', :strict => true) }
|
||||||
|
|
||||||
|
block.should raise_error(LL::ParserError)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'parses an HTML document' do
|
describe 'parsing HTML' do
|
||||||
document = described_class.parse_xml('<html><body></body></html>')
|
it 'parses an HTML document' do
|
||||||
|
document = described_class.parse_xml('<html><body></body></html>')
|
||||||
|
|
||||||
document.is_a?(Oga::XML::Document).should == true
|
document.is_a?(Oga::XML::Document).should == true
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
describe 'SAX parsing' do
|
describe 'SAX parsing' do
|
||||||
|
@ -27,13 +37,19 @@ describe Oga do
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'parses an XML document using the SAX parser' do
|
it 'parses an XML document using the SAX parser' do
|
||||||
Oga.sax_parse_xml(@handler, '<foo />')
|
described_class.sax_parse_xml(@handler, '<foo />')
|
||||||
|
|
||||||
@handler.name.should == 'foo'
|
@handler.name.should == 'foo'
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it 'raises an error when parsing an invalid XML document in strict mode' do
|
||||||
|
block = proc { Oga.sax_parse_xml(@handler, '<foo>', :strict => true) }
|
||||||
|
|
||||||
|
block.should raise_error(LL::ParserError)
|
||||||
|
end
|
||||||
|
|
||||||
it 'parses an HTML document using the SAX parser' do
|
it 'parses an HTML document using the SAX parser' do
|
||||||
Oga.sax_parse_html(@handler, '<link>')
|
described_class.sax_parse_html(@handler, '<link>')
|
||||||
|
|
||||||
@handler.name.should == 'link'
|
@handler.name.should == 'link'
|
||||||
end
|
end
|
||||||
|
|
|
@ -0,0 +1,12 @@
|
||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
describe Oga::XML::Lexer do
|
||||||
|
describe 'lexing XML using strict mode' do
|
||||||
|
it 'does not automatically insert missing closing tags' do
|
||||||
|
lex('<foo>bar', :strict => true).should == [
|
||||||
|
[:T_ELEM_NAME, 'foo', 1],
|
||||||
|
[:T_TEXT, 'bar', 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
Loading…
Reference in New Issue