Recursive closing of parent HTML elements

When closing certain HTML elements the lexer should also close whatever
parent elements remain. For example, consider the following HTML:

    <table>
        <thead>
            <tr>
                <th>Foo
                <th>Bar
        <tbody>
            ...
        </tbody>
    </table>

Here the "<tbody>" element should close not only the "<th>Bar" element
but also the parent "<tr>" and "<thead>" elements. This ensures we end
up with the following HTML:

    <table>
        <thead>
            <tr>
                <th>Foo</th>
                <th>Bar</th>
            </tr>
        </thead>
        <tbody>
            ...
        </tbody>
    </table>

Instead of garbage along the lines of this:

    <table>
        <thead>
            <tr>
                <th>Foo</th>
                <th>Bar</th>
        <tbody>
            ...
        </tbody>
    </table></tr></thead>

Fixes #99 (hopefully for good this time)
This commit is contained in:
Yorick Peterse 2015-05-12 00:35:00 +02:00
parent 11c9b69847
commit 1e0b7feb02
3 changed files with 70 additions and 7 deletions

View File

@ -66,14 +66,14 @@ module Oga
'rp' => NodeNameSet.new(%w{rb rt rtc rp}), 'rp' => NodeNameSet.new(%w{rb rt rtc rp}),
'optgroup' => NodeNameSet.new(%w{optgroup}), 'optgroup' => NodeNameSet.new(%w{optgroup}),
'option' => NodeNameSet.new(%w{option optgroup}), 'option' => NodeNameSet.new(%w{option optgroup}),
'colgrop' => NodeNameSet.new(%w{thead tbody tfoot}), 'colgroup' => NodeNameSet.new(%w{thead tbody tfoot}),
'caption' => NodeNameSet.new(%w{thead tbody tfoot}), 'caption' => NodeNameSet.new(%w{thead tbody tfoot}),
'thead' => NodeNameSet.new(%w{tbody tfoot}), 'thead' => NodeNameSet.new(%w{thead tbody tfoot}),
'tbody' => NodeNameSet.new(%w{tbody tfoot}), 'tbody' => NodeNameSet.new(%w{thead tbody tfoot}),
'tfoot' => NodeNameSet.new(%w{tbody}), 'tfoot' => NodeNameSet.new(%w{thead tbody tfoot}),
'tr' => NodeNameSet.new(%w{tr}), 'tr' => NodeNameSet.new(%w{tr tbody thead tfoot}),
'td' => NodeNameSet.new(%w{td th}), 'td' => NodeNameSet.new(%w{td th tbody thead tfoot tr}),
'th' => NodeNameSet.new(%w{td th}), 'th' => NodeNameSet.new(%w{td th tbody thead tfoot tr}),
'p' => NodeNameSet.new(%w{ 'p' => NodeNameSet.new(%w{
address article aside blockquote div dl fieldset footer form h1 h2 h3 address article aside blockquote div dl fieldset footer form h1 h2 h3
h4 h5 h6 header hgroup hr main nav ol p pre section table ul h4 h5 h6 header hgroup hr main nav ol p pre section table ul
@ -427,6 +427,17 @@ module Oga
if close_current and close_current.include?(name) if close_current and close_current.include?(name)
on_element_end on_element_end
end end
# Close remaining parent elements. This for example ensures that a
# "<tbody>" not only closes an unclosed "<th>" but also the surrounding,
# unclosed "<tr>".
while close_current = HTML_CLOSE_SELF[current_element]
if close_current.include?(name)
on_element_end
else
break
end
end
end end
## ##

View File

@ -0,0 +1,22 @@
require 'spec_helper'

# Lexer spec covering implicit closing of <option>/<optgroup> elements when
# the HTML input leaves them unclosed.
describe Oga::XML::Lexer do
  describe 'HTML optgroup elements' do
    describe 'with unclosed <optgroup> tags' do
      it 'lexes an <option> tag followed by a <optgroup> tag' do
        html = '<optgroup><option>foo<optgroup><option>bar'

        # The first pair of T_ELEM_END tokens is expected before the second
        # <optgroup> starts: one for the unclosed <option> and one for its
        # parent <optgroup>. The trailing pair closes the second group.
        tokens = [
          [:T_ELEM_NAME, 'optgroup', 1],
          [:T_ELEM_NAME, 'option', 1],
          [:T_TEXT, 'foo', 1],
          [:T_ELEM_END, nil, 1],
          [:T_ELEM_END, nil, 1],
          [:T_ELEM_NAME, 'optgroup', 1],
          [:T_ELEM_NAME, 'option', 1],
          [:T_TEXT, 'bar', 1],
          [:T_ELEM_END, nil, 1],
          [:T_ELEM_END, nil, 1]
        ]

        lex_html(html).should == tokens
      end
    end
  end
end

View File

@ -0,0 +1,30 @@
require 'spec_helper'

# Lexer spec covering implicit closing of table elements (<tr>, <th>) when a
# <tbody> is opened while they are still unclosed.
describe Oga::XML::Lexer do
  describe 'HTML tables' do
    describe 'with unclosed <tr> tags' do
      it 'lexes a <tr> tag followed by a <tbody> tag' do
        html = '<tr>foo<tbody></tbody>'

        # The <tr> is expected to be ended before the <tbody> starts.
        tokens = [
          [:T_ELEM_NAME, 'tr', 1],
          [:T_TEXT, 'foo', 1],
          [:T_ELEM_END, nil, 1],
          [:T_ELEM_NAME, 'tbody', 1],
          [:T_ELEM_END, nil, 1]
        ]

        lex_html(html).should == tokens
      end

      it 'lexes an unclosed <th> tag followed by a <tbody> tag' do
        html = '<tr><th>foo<tbody>bar</tbody>'

        # Two consecutive T_ELEM_END tokens are expected before <tbody>:
        # one for the unclosed <th> and one for its parent <tr>.
        tokens = [
          [:T_ELEM_NAME, 'tr', 1],
          [:T_ELEM_NAME, 'th', 1],
          [:T_TEXT, 'foo', 1],
          [:T_ELEM_END, nil, 1],
          [:T_ELEM_END, nil, 1],
          [:T_ELEM_NAME, 'tbody', 1],
          [:T_TEXT, 'bar', 1],
          [:T_ELEM_END, nil, 1]
        ]

        lex_html(html).should == tokens
      end
    end
  end
end