From 13e2c3d82ffb9f32b863cb47f6808cf061e07095 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Sun, 19 Apr 2015 23:19:02 +0200 Subject: [PATCH] Better handling of incorrect XML/HTML tags The XML/HTML lexer is now capable of processing most invalid XML/HTML (that I can think of at least). This is achieved by inserting missing closing tags (where needed) and/or ignoring excessive closing tags. For example, HTML such as this:
    <a></a></p>
Results in the following tokens: [:T_ELEM_START, nil, 1] [:T_ELEM_NAME, 'a', 1] [:T_ELEM_END, nil, 1] In turn this HTML: <a> Results in these tokens: [:T_ELEM_START, nil, 1] [:T_ELEM_NAME, 'a', 1] [:T_ELEM_END, nil, 1] Fixes #84 --- lib/oga/xml/lexer.rb | 11 +++++++-- spec/oga/xml/lexer/elements_spec.rb | 12 ++++++---- spec/oga/xml/lexer/invalid_elements_spec.rb | 25 +++++++++++++++++++++ spec/oga/xml/parser/error_spec.rb | 15 ++++--------- 4 files changed, 46 insertions(+), 17 deletions(-) create mode 100644 spec/oga/xml/lexer/invalid_elements_spec.rb diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb index 7038275..63c71a4 100644 --- a/lib/oga/xml/lexer.rb +++ b/lib/oga/xml/lexer.rb @@ -151,6 +151,11 @@ module Oga read_data do |chunk| advance_native(chunk) end + + # Add any missing closing tags + unless @elements.empty? + @elements.length.times { on_element_end } + end ensure @block = nil end @@ -377,7 +382,7 @@ module Oga # @param [String] name The name of the element, including namespace. # def on_element_name(name) - @elements << name if html? + @elements << name add_token(:T_ELEM_NAME, name) end @@ -410,9 +415,11 @@ module Oga # Called on the closing tag of an element. # def on_element_end + return if @elements.empty? + add_token(:T_ELEM_END) - @elements.pop if html? + @elements.pop end ## diff --git a/spec/oga/xml/lexer/elements_spec.rb b/spec/oga/xml/lexer/elements_spec.rb index f5d4851..0918811 100644 --- a/spec/oga/xml/lexer/elements_spec.rb +++ b/spec/oga/xml/lexer/elements_spec.rb @@ -5,21 +5,24 @@ describe Oga::XML::Lexer do it 'lexes an opening element' do lex('

').should == [ [:T_ELEM_START, nil, 1], - [:T_ELEM_NAME, 'p', 1] + [:T_ELEM_NAME, 'p', 1], + [:T_ELEM_END, nil, 1] ] end it 'lexes an opening element with a stray double quote' do lex('').should == [ [:T_ELEM_START, nil, 1], - [:T_ELEM_NAME, 'p', 1] + [:T_ELEM_NAME, 'p', 1], + [:T_ELEM_END, nil, 1] ] end it 'lexes an opening element with a stray double quoted string' do lex('').should == [ [:T_ELEM_START, nil, 1], - [:T_ELEM_NAME, 'p', 1] + [:T_ELEM_NAME, 'p', 1], + [:T_ELEM_END, nil, 1] ] end @@ -60,7 +63,8 @@ describe Oga::XML::Lexer do lex('Foo

').should == [ [:T_TEXT, 'Foo', 1], [:T_ELEM_START, nil, 1], - [:T_ELEM_NAME, 'p', 1] + [:T_ELEM_NAME, 'p', 1], + [:T_ELEM_END, nil, 1] ] end diff --git a/spec/oga/xml/lexer/invalid_elements_spec.rb b/spec/oga/xml/lexer/invalid_elements_spec.rb new file mode 100644 index 0000000..92ac94e --- /dev/null +++ b/spec/oga/xml/lexer/invalid_elements_spec.rb @@ -0,0 +1,25 @@ +require 'spec_helper' + +describe Oga::XML::Lexer do + describe 'invalid elements' do + it 'adds missing closing tags' do + lex('').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'a', 1], + [:T_ELEM_END, nil, 1] + ] + end + + it 'ignores closing tags without opening tags' do + lex('').should == [] + end + + it 'ignores excessive closing tags' do + lex('').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'a', 1], + [:T_ELEM_END, nil, 1] + ] + end + end +end diff --git a/spec/oga/xml/parser/error_spec.rb b/spec/oga/xml/parser/error_spec.rb index aa74b38..66e1894 100644 --- a/spec/oga/xml/parser/error_spec.rb +++ b/spec/oga/xml/parser/error_spec.rb @@ -3,13 +3,7 @@ require 'spec_helper' describe Oga::XML::Parser do describe 'raising syntax errors' do before do - @invalid_xml = <<-EOF.strip - - Alice - 25 - Dutch - - EOF + @invalid_xml = '' end it 'raises a LL::ParserError' do @@ -17,16 +11,15 @@ describe Oga::XML::Parser do end it 'includes the line number when using a String as input' do - parse_error(@invalid_xml).should =~ /on line 5/ + parse_error(@invalid_xml).should =~ /on line 1/ end it 'includes the line number when using an IO as input' do - parse_error(StringIO.new(@invalid_xml)).should =~ /on line 5/ + parse_error(StringIO.new(@invalid_xml)).should =~ /on line 1/ end it 'uses more friendly error messages when available' do - parse_error('').should == - 'Unexpected end of input, expected element closing tag instead on line 1' + parse_error('').should =~ /Unexpected element namespace/ end end end