Better handling of incorrect XML/HTML tags
The XML/HTML lexer is now capable of processing most invalid XML/HTML (that I can think of, at least). This is achieved by inserting missing closing tags (where needed) and/or ignoring excessive closing tags. For example, HTML such as this: <a></a></p> results in the following tokens: [:T_ELEM_START, nil, 1] [:T_ELEM_NAME, 'a', 1] [:T_ELEM_END, nil, 1]. In turn, this HTML: <a> results in these tokens: [:T_ELEM_START, nil, 1] [:T_ELEM_NAME, 'a', 1] [:T_ELEM_END, nil, 1]. Fixes #84
This commit is contained in:
parent
84e1bfc955
commit
13e2c3d82f
|
@ -151,6 +151,11 @@ module Oga
|
||||||
read_data do |chunk|
|
read_data do |chunk|
|
||||||
advance_native(chunk)
|
advance_native(chunk)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Add any missing closing tags
|
||||||
|
unless @elements.empty?
|
||||||
|
@elements.length.times { on_element_end }
|
||||||
|
end
|
||||||
ensure
|
ensure
|
||||||
@block = nil
|
@block = nil
|
||||||
end
|
end
|
||||||
|
@ -377,7 +382,7 @@ module Oga
|
||||||
# @param [String] name The name of the element, including namespace.
|
# @param [String] name The name of the element, including namespace.
|
||||||
#
|
#
|
||||||
def on_element_name(name)
|
def on_element_name(name)
|
||||||
@elements << name if html?
|
@elements << name
|
||||||
|
|
||||||
add_token(:T_ELEM_NAME, name)
|
add_token(:T_ELEM_NAME, name)
|
||||||
end
|
end
|
||||||
|
@ -410,9 +415,11 @@ module Oga
|
||||||
# Called on the closing tag of an element.
|
# Called on the closing tag of an element.
|
||||||
#
|
#
|
||||||
def on_element_end
|
def on_element_end
|
||||||
|
return if @elements.empty?
|
||||||
|
|
||||||
add_token(:T_ELEM_END)
|
add_token(:T_ELEM_END)
|
||||||
|
|
||||||
@elements.pop if html?
|
@elements.pop
|
||||||
end
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
|
|
|
@ -5,21 +5,24 @@ describe Oga::XML::Lexer do
|
||||||
it 'lexes an opening element' do
|
it 'lexes an opening element' do
|
||||||
lex('<p>').should == [
|
lex('<p>').should == [
|
||||||
[:T_ELEM_START, nil, 1],
|
[:T_ELEM_START, nil, 1],
|
||||||
[:T_ELEM_NAME, 'p', 1]
|
[:T_ELEM_NAME, 'p', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'lexes an opening element with a stray double quote' do
|
it 'lexes an opening element with a stray double quote' do
|
||||||
lex('<p">').should == [
|
lex('<p">').should == [
|
||||||
[:T_ELEM_START, nil, 1],
|
[:T_ELEM_START, nil, 1],
|
||||||
[:T_ELEM_NAME, 'p', 1]
|
[:T_ELEM_NAME, 'p', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'lexes an opening element with a stray double quoted string' do
|
it 'lexes an opening element with a stray double quoted string' do
|
||||||
lex('<p"">').should == [
|
lex('<p"">').should == [
|
||||||
[:T_ELEM_START, nil, 1],
|
[:T_ELEM_START, nil, 1],
|
||||||
[:T_ELEM_NAME, 'p', 1]
|
[:T_ELEM_NAME, 'p', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -60,7 +63,8 @@ describe Oga::XML::Lexer do
|
||||||
lex('Foo<p>').should == [
|
lex('Foo<p>').should == [
|
||||||
[:T_TEXT, 'Foo', 1],
|
[:T_TEXT, 'Foo', 1],
|
||||||
[:T_ELEM_START, nil, 1],
|
[:T_ELEM_START, nil, 1],
|
||||||
[:T_ELEM_NAME, 'p', 1]
|
[:T_ELEM_NAME, 'p', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
describe Oga::XML::Lexer do
|
||||||
|
describe 'invalid elements' do
|
||||||
|
it 'adds missing closing tags' do
|
||||||
|
lex('<a>').should == [
|
||||||
|
[:T_ELEM_START, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'a', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'ignores closing tags without opening tags' do
|
||||||
|
lex('</a>').should == []
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'ignores excessive closing tags' do
|
||||||
|
lex('<a></a></b>').should == [
|
||||||
|
[:T_ELEM_START, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'a', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -3,13 +3,7 @@ require 'spec_helper'
|
||||||
describe Oga::XML::Parser do
|
describe Oga::XML::Parser do
|
||||||
describe 'raising syntax errors' do
|
describe 'raising syntax errors' do
|
||||||
before do
|
before do
|
||||||
@invalid_xml = <<-EOF.strip
|
@invalid_xml = '<x:y:z></z>'
|
||||||
<person>
|
|
||||||
<name>Alice</name>
|
|
||||||
<age>25
|
|
||||||
<nationality>Dutch</nationality>
|
|
||||||
</person>
|
|
||||||
EOF
|
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'raises a LL::ParserError' do
|
it 'raises a LL::ParserError' do
|
||||||
|
@ -17,16 +11,15 @@ describe Oga::XML::Parser do
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'includes the line number when using a String as input' do
|
it 'includes the line number when using a String as input' do
|
||||||
parse_error(@invalid_xml).should =~ /on line 5/
|
parse_error(@invalid_xml).should =~ /on line 1/
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'includes the line number when using an IO as input' do
|
it 'includes the line number when using an IO as input' do
|
||||||
parse_error(StringIO.new(@invalid_xml)).should =~ /on line 5/
|
parse_error(StringIO.new(@invalid_xml)).should =~ /on line 1/
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'uses more friendly error messages when available' do
|
it 'uses more friendly error messages when available' do
|
||||||
parse_error('<foo>').should ==
|
parse_error('<x:y:z>').should =~ /Unexpected element namespace/
|
||||||
'Unexpected end of input, expected element closing tag instead on line 1'
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue