Recursively close parent HTML elements
When closing certain HTML elements, the lexer should also close any parent elements that are still open. For example, consider the following HTML:

    <table>
    <thead>
    <tr>
    <th>Foo
    <th>Bar
    <tbody>
    ...
    </tbody>
    </table>

Here the "<tbody>" element should close not only the "<th>Bar" element but also the parent "<tr>" and "<thead>" elements. This ensures we end up with the following HTML:

    <table>
    <thead>
    <tr>
    <th>Foo</th>
    <th>Bar</th>
    </tr>
    </thead>
    <tbody>
    ...
    </tbody>
    </table>

Instead of garbage along the lines of this:

    <table>
    <thead>
    <tr>
    <th>Foo</th>
    <th>Bar</th>
    <tbody>
    ...
    </tbody>
    </table></tr></thead>

Fixes #99 (hopefully for good this time)
This commit is contained in:
parent
11c9b69847
commit
1e0b7feb02
|
@ -66,14 +66,14 @@ module Oga
|
||||||
'rp' => NodeNameSet.new(%w{rb rt rtc rp}),
|
'rp' => NodeNameSet.new(%w{rb rt rtc rp}),
|
||||||
'optgroup' => NodeNameSet.new(%w{optgroup}),
|
'optgroup' => NodeNameSet.new(%w{optgroup}),
|
||||||
'option' => NodeNameSet.new(%w{option optgroup}),
|
'option' => NodeNameSet.new(%w{option optgroup}),
|
||||||
'colgrop' => NodeNameSet.new(%w{thead tbody tfoot}),
|
'colgroup' => NodeNameSet.new(%w{thead tbody tfoot}),
|
||||||
'caption' => NodeNameSet.new(%w{thead tbody tfoot}),
|
'caption' => NodeNameSet.new(%w{thead tbody tfoot}),
|
||||||
'thead' => NodeNameSet.new(%w{tbody tfoot}),
|
'thead' => NodeNameSet.new(%w{thead tbody tfoot}),
|
||||||
'tbody' => NodeNameSet.new(%w{tbody tfoot}),
|
'tbody' => NodeNameSet.new(%w{thead tbody tfoot}),
|
||||||
'tfoot' => NodeNameSet.new(%w{tbody}),
|
'tfoot' => NodeNameSet.new(%w{thead tbody tfoot}),
|
||||||
'tr' => NodeNameSet.new(%w{tr}),
|
'tr' => NodeNameSet.new(%w{tr tbody thead tfoot}),
|
||||||
'td' => NodeNameSet.new(%w{td th}),
|
'td' => NodeNameSet.new(%w{td th tbody thead tfoot tr}),
|
||||||
'th' => NodeNameSet.new(%w{td th}),
|
'th' => NodeNameSet.new(%w{td th tbody thead tfoot tr}),
|
||||||
'p' => NodeNameSet.new(%w{
|
'p' => NodeNameSet.new(%w{
|
||||||
address article aside blockquote div dl fieldset footer form h1 h2 h3
|
address article aside blockquote div dl fieldset footer form h1 h2 h3
|
||||||
h4 h5 h6 header hgroup hr main nav ol p pre section table ul
|
h4 h5 h6 header hgroup hr main nav ol p pre section table ul
|
||||||
|
@ -427,6 +427,17 @@ module Oga
|
||||||
if close_current and close_current.include?(name)
|
if close_current and close_current.include?(name)
|
||||||
on_element_end
|
on_element_end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Close remaining parent elements. This for example ensures that a
|
||||||
|
# "<tbody>" not only closes an unclosed "<th>" but also the surrounding,
|
||||||
|
# unclosed "<tr>".
|
||||||
|
while close_current = HTML_CLOSE_SELF[current_element]
|
||||||
|
if close_current.include?(name)
|
||||||
|
on_element_end
|
||||||
|
else
|
||||||
|
break
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
|
|
|
@ -0,0 +1,22 @@
|
||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
describe Oga::XML::Lexer do
|
||||||
|
describe 'HTML optgroup elements' do
|
||||||
|
describe 'with unclosed <optgroup> tags' do
|
||||||
|
it 'lexes an <option> tag followed by a <optgroup> tag' do
|
||||||
|
lex_html('<optgroup><option>foo<optgroup><option>bar').should == [
|
||||||
|
[:T_ELEM_NAME, 'optgroup', 1],
|
||||||
|
[:T_ELEM_NAME, 'option', 1],
|
||||||
|
[:T_TEXT, 'foo', 1],
|
||||||
|
[:T_ELEM_END, nil, 1],
|
||||||
|
[:T_ELEM_END, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'optgroup', 1],
|
||||||
|
[:T_ELEM_NAME, 'option', 1],
|
||||||
|
[:T_TEXT, 'bar', 1],
|
||||||
|
[:T_ELEM_END, nil, 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,30 @@
|
||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
describe Oga::XML::Lexer do
|
||||||
|
describe 'HTML tables' do
|
||||||
|
describe 'with unclosed <tr> tags' do
|
||||||
|
it 'lexes a <tr> tag followed by a <tbody> tag' do
|
||||||
|
lex_html('<tr>foo<tbody></tbody>').should == [
|
||||||
|
[:T_ELEM_NAME, 'tr', 1],
|
||||||
|
[:T_TEXT, 'foo', 1],
|
||||||
|
[:T_ELEM_END, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'tbody', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'lexes an unclosed <th> tag followed by a <tbody> tag' do
|
||||||
|
lex_html('<tr><th>foo<tbody>bar</tbody>').should == [
|
||||||
|
[:T_ELEM_NAME, 'tr', 1],
|
||||||
|
[:T_ELEM_NAME, 'th', 1],
|
||||||
|
[:T_TEXT, 'foo', 1],
|
||||||
|
[:T_ELEM_END, nil, 1],
|
||||||
|
[:T_ELEM_END, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'tbody', 1],
|
||||||
|
[:T_TEXT, 'bar', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
Loading…
Reference in New Issue