From 1e0b7feb026d95f2b04706391a868d64b7e5de6e Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Tue, 12 May 2015 00:35:00 +0200 Subject: [PATCH] Recursively closing of parent HTML elements When closing certain HTML elements the lexer should also close whatever parent elements remain. For example, consider the following HTML: ...
Foo Bar
Here the "<tbody>" element shouldn't only close the "<th>Bar" element but also the parent "<tr>" and "<thead>" elements. This ensures we'd end up with the following HTML: ...
Foo Bar
Instead of garbage along the lines of this: ...
Foo Bar
Fixes #99 (hopefully for good this time) --- lib/oga/xml/lexer.rb | 25 +++++++++++----- .../lexer/html_closing_rules/optgroup_spec.rb | 22 ++++++++++++++ .../lexer/html_closing_rules/table_spec.rb | 30 +++++++++++++++++++ 3 files changed, 70 insertions(+), 7 deletions(-) create mode 100644 spec/oga/xml/lexer/html_closing_rules/optgroup_spec.rb create mode 100644 spec/oga/xml/lexer/html_closing_rules/table_spec.rb diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb index 6fbb583..3fbb00b 100644 --- a/lib/oga/xml/lexer.rb +++ b/lib/oga/xml/lexer.rb @@ -66,14 +66,14 @@ module Oga 'rp' => NodeNameSet.new(%w{rb rt rtc rp}), 'optgroup' => NodeNameSet.new(%w{optgroup}), 'option' => NodeNameSet.new(%w{option optgroup}), - 'colgrop' => NodeNameSet.new(%w{thead tbody tfoot}), + 'colgroup' => NodeNameSet.new(%w{thead tbody tfoot}), 'caption' => NodeNameSet.new(%w{thead tbody tfoot}), - 'thead' => NodeNameSet.new(%w{tbody tfoot}), - 'tbody' => NodeNameSet.new(%w{tbody tfoot}), - 'tfoot' => NodeNameSet.new(%w{tbody}), - 'tr' => NodeNameSet.new(%w{tr}), - 'td' => NodeNameSet.new(%w{td th}), - 'th' => NodeNameSet.new(%w{td th}), + 'thead' => NodeNameSet.new(%w{thead tbody tfoot}), + 'tbody' => NodeNameSet.new(%w{thead tbody tfoot}), + 'tfoot' => NodeNameSet.new(%w{thead tbody tfoot}), + 'tr' => NodeNameSet.new(%w{tr tbody thead tfoot}), + 'td' => NodeNameSet.new(%w{td th tbody thead tfoot tr}), + 'th' => NodeNameSet.new(%w{td th tbody thead tfoot tr}), 'p' => NodeNameSet.new(%w{ address article aside blockquote div dl fieldset footer form h1 h2 h3 h4 h5 h6 header hgroup hr main nav ol p pre section table ul @@ -427,6 +427,17 @@ module Oga if close_current and close_current.include?(name) on_element_end end + + # Close remaining parent elements. This for example ensures that a + # "" not only closes an unclosed "" but also the surrounding, + # unclosed "". 
+ while close_current = HTML_CLOSE_SELF[current_element] + if close_current.include?(name) + on_element_end + else + break + end + end end ## diff --git a/spec/oga/xml/lexer/html_closing_rules/optgroup_spec.rb b/spec/oga/xml/lexer/html_closing_rules/optgroup_spec.rb new file mode 100644 index 0000000..913807a --- /dev/null +++ b/spec/oga/xml/lexer/html_closing_rules/optgroup_spec.rb @@ -0,0 +1,22 @@ +require 'spec_helper' + +describe Oga::XML::Lexer do + describe 'HTML optgroup elements' do + describe 'with unclosed tags' do + it 'lexes an tag' do + lex_html('