Recursive closing of parent HTML elements

When closing certain HTML elements the lexer should also close whatever
parent elements remain. For example, consider the following HTML:

    <table>
        <thead>
            <tr>
                <th>Foo
                <th>Bar
        <tbody>
            ...
        </tbody>
    </table>

Here the "<tbody>" element should close not only the "<th>Bar" element
but also the parent "<tr>" and "<thead>" elements. This ensures we end
up with the following HTML:

    <table>
        <thead>
            <tr>
                <th>Foo</th>
                <th>Bar</th>
            </tr>
        </thead>
        <tbody>
            ...
        </tbody>
    </table>

Instead of garbage along the lines of this:

    <table>
        <thead>
            <tr>
                <th>Foo</th>
                <th>Bar</th>
        <tbody>
            ...
        </tbody>
    </table></tr></thead>

Fixes #99 (hopefully for good this time)
This commit is contained in:
Yorick Peterse 2015-05-12 00:35:00 +02:00
parent 11c9b69847
commit 1e0b7feb02
3 changed files with 70 additions and 7 deletions

View File

@ -66,14 +66,14 @@ module Oga
'rp' => NodeNameSet.new(%w{rb rt rtc rp}),
'optgroup' => NodeNameSet.new(%w{optgroup}),
'option' => NodeNameSet.new(%w{option optgroup}),
'colgrop' => NodeNameSet.new(%w{thead tbody tfoot}),
'colgroup' => NodeNameSet.new(%w{thead tbody tfoot}),
'caption' => NodeNameSet.new(%w{thead tbody tfoot}),
'thead' => NodeNameSet.new(%w{tbody tfoot}),
'tbody' => NodeNameSet.new(%w{tbody tfoot}),
'tfoot' => NodeNameSet.new(%w{tbody}),
'tr' => NodeNameSet.new(%w{tr}),
'td' => NodeNameSet.new(%w{td th}),
'th' => NodeNameSet.new(%w{td th}),
'thead' => NodeNameSet.new(%w{thead tbody tfoot}),
'tbody' => NodeNameSet.new(%w{thead tbody tfoot}),
'tfoot' => NodeNameSet.new(%w{thead tbody tfoot}),
'tr' => NodeNameSet.new(%w{tr tbody thead tfoot}),
'td' => NodeNameSet.new(%w{td th tbody thead tfoot tr}),
'th' => NodeNameSet.new(%w{td th tbody thead tfoot tr}),
'p' => NodeNameSet.new(%w{
address article aside blockquote div dl fieldset footer form h1 h2 h3
h4 h5 h6 header hgroup hr main nav ol p pre section table ul
@ -427,6 +427,17 @@ module Oga
if close_current and close_current.include?(name)
on_element_end
end
# Close remaining parent elements. This for example ensures that a
# "<tbody>" not only closes an unclosed "<th>" but also the surrounding,
# unclosed "<tr>".
while close_current = HTML_CLOSE_SELF[current_element]
if close_current.include?(name)
on_element_end
else
break
end
end
end
##

View File

@ -0,0 +1,22 @@
require 'spec_helper'
describe Oga::XML::Lexer do
  describe 'HTML optgroup elements' do
    describe 'with unclosed <optgroup> tags' do
      # The expected stream contains two T_ELEM_END tokens right after the
      # "foo" text, i.e. before the second <optgroup> starts, and two more
      # at the end of the input.
      it 'lexes an <option> tag followed by a <optgroup> tag' do
        html = '<optgroup><option>foo<optgroup><option>bar'

        expected = [
          [:T_ELEM_NAME, 'optgroup', 1],
          [:T_ELEM_NAME, 'option', 1],
          [:T_TEXT, 'foo', 1],
          [:T_ELEM_END, nil, 1],
          [:T_ELEM_END, nil, 1],
          [:T_ELEM_NAME, 'optgroup', 1],
          [:T_ELEM_NAME, 'option', 1],
          [:T_TEXT, 'bar', 1],
          [:T_ELEM_END, nil, 1],
          [:T_ELEM_END, nil, 1]
        ]

        lex_html(html).should == expected
      end
    end
  end
end

View File

@ -0,0 +1,30 @@
require 'spec_helper'
describe Oga::XML::Lexer do
  describe 'HTML tables' do
    describe 'with unclosed <tr> tags' do
      # The expected stream has the <tr>'s T_ELEM_END before the <tbody>
      # element name, so the <tbody> is not nested inside the <tr>.
      it 'lexes a <tr> tag followed by a <tbody> tag' do
        html = '<tr>foo<tbody></tbody>'

        expected = [
          [:T_ELEM_NAME, 'tr', 1],
          [:T_TEXT, 'foo', 1],
          [:T_ELEM_END, nil, 1],
          [:T_ELEM_NAME, 'tbody', 1],
          [:T_ELEM_END, nil, 1]
        ]

        lex_html(html).should == expected
      end

      # Two T_ELEM_END tokens are expected after "foo" (one for <th>, one for
      # the surrounding <tr>) before the <tbody> element name appears.
      it 'lexes an unclosed <th> tag followed by a <tbody> tag' do
        html = '<tr><th>foo<tbody>bar</tbody>'

        expected = [
          [:T_ELEM_NAME, 'tr', 1],
          [:T_ELEM_NAME, 'th', 1],
          [:T_TEXT, 'foo', 1],
          [:T_ELEM_END, nil, 1],
          [:T_ELEM_END, nil, 1],
          [:T_ELEM_NAME, 'tbody', 1],
          [:T_TEXT, 'bar', 1],
          [:T_ELEM_END, nil, 1]
        ]

        lex_html(html).should == expected
      end
    end
  end
end