Better handling of incorrect XML/HTML tags
The XML/HTML lexer is now capable of processing most invalid XML/HTML (that I can think of, at least). This is achieved by inserting missing closing tags (where needed) and/or ignoring excessive closing tags. For example, HTML such as this: <a></a></p> results in the following tokens: [:T_ELEM_START, nil, 1] [:T_ELEM_NAME, 'a', 1] [:T_ELEM_END, nil, 1]. In turn, this HTML: <a> results in these tokens: [:T_ELEM_START, nil, 1] [:T_ELEM_NAME, 'a', 1] [:T_ELEM_END, nil, 1]. Fixes #84
This commit is contained in:
parent
84e1bfc955
commit
13e2c3d82f
|
@ -151,6 +151,11 @@ module Oga
|
|||
read_data do |chunk|
|
||||
advance_native(chunk)
|
||||
end
|
||||
|
||||
# Add any missing closing tags
|
||||
unless @elements.empty?
|
||||
@elements.length.times { on_element_end }
|
||||
end
|
||||
ensure
|
||||
@block = nil
|
||||
end
|
||||
|
@ -377,7 +382,7 @@ module Oga
|
|||
# @param [String] name The name of the element, including namespace.
|
||||
#
|
||||
def on_element_name(name)
|
||||
@elements << name if html?
|
||||
@elements << name
|
||||
|
||||
add_token(:T_ELEM_NAME, name)
|
||||
end
|
||||
|
@ -410,9 +415,11 @@ module Oga
|
|||
# Called on the closing tag of an element.
|
||||
#
|
||||
def on_element_end
|
||||
return if @elements.empty?
|
||||
|
||||
add_token(:T_ELEM_END)
|
||||
|
||||
@elements.pop if html?
|
||||
@elements.pop
|
||||
end
|
||||
|
||||
##
|
||||
|
|
|
@ -5,21 +5,24 @@ describe Oga::XML::Lexer do
|
|||
it 'lexes an opening element' do
|
||||
lex('<p>').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'p', 1]
|
||||
[:T_ELEM_NAME, 'p', 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
it 'lexes an opening element with a stray double quote' do
|
||||
lex('<p">').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'p', 1]
|
||||
[:T_ELEM_NAME, 'p', 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
it 'lexes an opening element with a stray double quoted string' do
|
||||
lex('<p"">').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'p', 1]
|
||||
[:T_ELEM_NAME, 'p', 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
|
@ -60,7 +63,8 @@ describe Oga::XML::Lexer do
|
|||
lex('Foo<p>').should == [
|
||||
[:T_TEXT, 'Foo', 1],
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'p', 1]
|
||||
[:T_ELEM_NAME, 'p', 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::XML::Lexer do
|
||||
describe 'invalid elements' do
|
||||
it 'adds missing closing tags' do
|
||||
lex('<a>').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'a', 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
it 'ignores closing tags without opening tags' do
|
||||
lex('</a>').should == []
|
||||
end
|
||||
|
||||
it 'ignores excessive closing tags' do
|
||||
lex('<a></a></b>').should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'a', 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
end
|
||||
end
|
|
@ -3,13 +3,7 @@ require 'spec_helper'
|
|||
describe Oga::XML::Parser do
|
||||
describe 'raising syntax errors' do
|
||||
before do
|
||||
@invalid_xml = <<-EOF.strip
|
||||
<person>
|
||||
<name>Alice</name>
|
||||
<age>25
|
||||
<nationality>Dutch</nationality>
|
||||
</person>
|
||||
EOF
|
||||
@invalid_xml = '<x:y:z></z>'
|
||||
end
|
||||
|
||||
it 'raises a LL::ParserError' do
|
||||
|
@ -17,16 +11,15 @@ describe Oga::XML::Parser do
|
|||
end
|
||||
|
||||
it 'includes the line number when using a String as input' do
|
||||
parse_error(@invalid_xml).should =~ /on line 5/
|
||||
parse_error(@invalid_xml).should =~ /on line 1/
|
||||
end
|
||||
|
||||
it 'includes the line number when using an IO as input' do
|
||||
parse_error(StringIO.new(@invalid_xml)).should =~ /on line 5/
|
||||
parse_error(StringIO.new(@invalid_xml)).should =~ /on line 1/
|
||||
end
|
||||
|
||||
it 'uses more friendly error messages when available' do
|
||||
parse_error('<foo>').should ==
|
||||
'Unexpected end of input, expected element closing tag instead on line 1'
|
||||
parse_error('<x:y:z>').should =~ /Unexpected element namespace/
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue