From 13e2c3d82ffb9f32b863cb47f6808cf061e07095 Mon Sep 17 00:00:00 2001
From: Yorick Peterse
Date: Sun, 19 Apr 2015 23:19:02 +0200
Subject: [PATCH] Better handling of incorrect XML/HTML tags
The XML/HTML lexer is now capable of processing most invalid XML/HTML
(that I can think of at least). This is achieved by inserting missing
closing tags (where needed) and/or ignoring excessive closing tags. For
example, HTML such as this:
    <a></a></p>
Results in the following tokens:
[:T_ELEM_START, nil, 1]
[:T_ELEM_NAME, 'a', 1]
[:T_ELEM_END, nil, 1]
In turn this HTML:
    <a>
Results in these tokens:
[:T_ELEM_START, nil, 1]
[:T_ELEM_NAME, 'a', 1]
[:T_ELEM_END, nil, 1]
Fixes #84
---
lib/oga/xml/lexer.rb | 11 +++++++--
spec/oga/xml/lexer/elements_spec.rb | 12 ++++++----
spec/oga/xml/lexer/invalid_elements_spec.rb | 25 +++++++++++++++++++++
spec/oga/xml/parser/error_spec.rb | 15 ++++---------
4 files changed, 46 insertions(+), 17 deletions(-)
create mode 100644 spec/oga/xml/lexer/invalid_elements_spec.rb
diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb
index 7038275..63c71a4 100644
--- a/lib/oga/xml/lexer.rb
+++ b/lib/oga/xml/lexer.rb
@@ -151,6 +151,11 @@ module Oga
read_data do |chunk|
advance_native(chunk)
end
+
+ # Add any missing closing tags
+ unless @elements.empty?
+ @elements.length.times { on_element_end }
+ end
ensure
@block = nil
end
@@ -377,7 +382,7 @@ module Oga
# @param [String] name The name of the element, including namespace.
#
def on_element_name(name)
- @elements << name if html?
+ @elements << name
add_token(:T_ELEM_NAME, name)
end
@@ -410,9 +415,11 @@ module Oga
# Called on the closing tag of an element.
#
def on_element_end
+ return if @elements.empty?
+
add_token(:T_ELEM_END)
- @elements.pop if html?
+ @elements.pop
end
##
diff --git a/spec/oga/xml/lexer/elements_spec.rb b/spec/oga/xml/lexer/elements_spec.rb
index f5d4851..0918811 100644
--- a/spec/oga/xml/lexer/elements_spec.rb
+++ b/spec/oga/xml/lexer/elements_spec.rb
@@ -5,21 +5,24 @@ describe Oga::XML::Lexer do
it 'lexes an opening element' do
lex('').should == [
[:T_ELEM_START, nil, 1],
- [:T_ELEM_NAME, 'p', 1]
+ [:T_ELEM_NAME, 'p', 1],
+ [:T_ELEM_END, nil, 1]
]
end
it 'lexes an opening element with a stray double quote' do
lex('
').should == [
[:T_ELEM_START, nil, 1],
- [:T_ELEM_NAME, 'p', 1]
+ [:T_ELEM_NAME, 'p', 1],
+ [:T_ELEM_END, nil, 1]
]
end
it 'lexes an opening element with a stray double quoted string' do
lex('
').should == [
[:T_ELEM_START, nil, 1],
- [:T_ELEM_NAME, 'p', 1]
+ [:T_ELEM_NAME, 'p', 1],
+ [:T_ELEM_END, nil, 1]
]
end
@@ -60,7 +63,8 @@ describe Oga::XML::Lexer do
lex('Foo
').should == [
[:T_TEXT, 'Foo', 1],
[:T_ELEM_START, nil, 1],
- [:T_ELEM_NAME, 'p', 1]
+ [:T_ELEM_NAME, 'p', 1],
+ [:T_ELEM_END, nil, 1]
]
end
diff --git a/spec/oga/xml/lexer/invalid_elements_spec.rb b/spec/oga/xml/lexer/invalid_elements_spec.rb
new file mode 100644
index 0000000..92ac94e
--- /dev/null
+++ b/spec/oga/xml/lexer/invalid_elements_spec.rb
@@ -0,0 +1,25 @@
+require 'spec_helper'
+
+describe Oga::XML::Lexer do
+ describe 'invalid elements' do
+ it 'adds missing closing tags' do
+ lex('').should == [
+ [:T_ELEM_START, nil, 1],
+ [:T_ELEM_NAME, 'a', 1],
+ [:T_ELEM_END, nil, 1]
+ ]
+ end
+
+ it 'ignores closing tags without opening tags' do
+ lex('').should == []
+ end
+
+ it 'ignores excessive closing tags' do
+ lex('').should == [
+ [:T_ELEM_START, nil, 1],
+ [:T_ELEM_NAME, 'a', 1],
+ [:T_ELEM_END, nil, 1]
+ ]
+ end
+ end
+end
diff --git a/spec/oga/xml/parser/error_spec.rb b/spec/oga/xml/parser/error_spec.rb
index aa74b38..66e1894 100644
--- a/spec/oga/xml/parser/error_spec.rb
+++ b/spec/oga/xml/parser/error_spec.rb
@@ -3,13 +3,7 @@ require 'spec_helper'
describe Oga::XML::Parser do
describe 'raising syntax errors' do
before do
- @invalid_xml = <<-EOF.strip
-
- Alice
- 25
- Dutch
-
- EOF
+ @invalid_xml = ''
end
it 'raises a LL::ParserError' do
@@ -17,16 +11,15 @@ describe Oga::XML::Parser do
end
it 'includes the line number when using a String as input' do
- parse_error(@invalid_xml).should =~ /on line 5/
+ parse_error(@invalid_xml).should =~ /on line 1/
end
it 'includes the line number when using an IO as input' do
- parse_error(StringIO.new(@invalid_xml)).should =~ /on line 5/
+ parse_error(StringIO.new(@invalid_xml)).should =~ /on line 1/
end
it 'uses more friendly error messages when available' do
- parse_error('').should ==
- 'Unexpected end of input, expected element closing tag instead on line 1'
+ parse_error('').should =~ /Unexpected element namespace/
end
end
end