Better handling of incorrect XML/HTML tags

The XML/HTML lexer is now capable of processing most invalid XML/HTML
(that I can think of at least). This is achieved by inserting missing
closing tags (where needed) and/or ignoring excessive closing tags. For
example, HTML such as this:

    <a></a></p>

Results in the following tokens (the stray `</p>` is ignored):

    [:T_ELEM_START, nil, 1]
    [:T_ELEM_NAME, 'a', 1]
    [:T_ELEM_END, nil, 1]

In turn this HTML:

    <a>

Results in these tokens (a closing token is inserted for the unclosed `<a>`):

    [:T_ELEM_START, nil, 1]
    [:T_ELEM_NAME, 'a', 1]
    [:T_ELEM_END, nil, 1]

Fixes #84
This commit is contained in:
Yorick Peterse 2015-04-19 23:19:02 +02:00
parent 84e1bfc955
commit 13e2c3d82f
4 changed files with 46 additions and 17 deletions

View File

@ -151,6 +151,11 @@ module Oga
read_data do |chunk|
advance_native(chunk)
end
# Add any missing closing tags
unless @elements.empty?
@elements.length.times { on_element_end }
end
ensure
@block = nil
end
@ -377,7 +382,7 @@ module Oga
# @param [String] name The name of the element, including namespace.
#
def on_element_name(name)
@elements << name if html?
@elements << name
add_token(:T_ELEM_NAME, name)
end
@ -410,9 +415,11 @@ module Oga
# Called on the closing tag of an element.
#
def on_element_end
return if @elements.empty?
add_token(:T_ELEM_END)
@elements.pop if html?
@elements.pop
end
##

View File

@ -5,21 +5,24 @@ describe Oga::XML::Lexer do
it 'lexes an opening element' do
lex('<p>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'p', 1]
[:T_ELEM_NAME, 'p', 1],
[:T_ELEM_END, nil, 1]
]
end
it 'lexes an opening element with a stray double quote' do
lex('<p">').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'p', 1]
[:T_ELEM_NAME, 'p', 1],
[:T_ELEM_END, nil, 1]
]
end
it 'lexes an opening element with a stray double quoted string' do
lex('<p"">').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'p', 1]
[:T_ELEM_NAME, 'p', 1],
[:T_ELEM_END, nil, 1]
]
end
@ -60,7 +63,8 @@ describe Oga::XML::Lexer do
lex('Foo<p>').should == [
[:T_TEXT, 'Foo', 1],
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'p', 1]
[:T_ELEM_NAME, 'p', 1],
[:T_ELEM_END, nil, 1]
]
end

View File

@ -0,0 +1,25 @@
require 'spec_helper'
describe Oga::XML::Lexer do
describe 'invalid elements' do
it 'adds missing closing tags' do
lex('<a>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'a', 1],
[:T_ELEM_END, nil, 1]
]
end
it 'ignores closing tags without opening tags' do
lex('</a>').should == []
end
it 'ignores excessive closing tags' do
lex('<a></a></b>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'a', 1],
[:T_ELEM_END, nil, 1]
]
end
end
end

View File

@ -3,13 +3,7 @@ require 'spec_helper'
describe Oga::XML::Parser do
describe 'raising syntax errors' do
before do
@invalid_xml = <<-EOF.strip
<person>
<name>Alice</name>
<age>25
<nationality>Dutch</nationality>
</person>
EOF
@invalid_xml = '<x:y:z></z>'
end
it 'raises a LL::ParserError' do
@ -17,16 +11,15 @@ describe Oga::XML::Parser do
end
it 'includes the line number when using a String as input' do
parse_error(@invalid_xml).should =~ /on line 5/
parse_error(@invalid_xml).should =~ /on line 1/
end
it 'includes the line number when using an IO as input' do
parse_error(StringIO.new(@invalid_xml)).should =~ /on line 5/
parse_error(StringIO.new(@invalid_xml)).should =~ /on line 1/
end
it 'uses more friendly error messages when available' do
parse_error('<foo>').should ==
'Unexpected end of input, expected element closing tag instead on line 1'
parse_error('<x:y:z>').should =~ /Unexpected element namespace/
end
end
end