Use blacklists/whitelists for HTML closing rules

This allows for more fine-grained control over when to close certain
elements. For example, an unclosed <tr> element should be closed first
when bumping into any element other than <td> or <th>. Using the old
NodeNameSet, this would mean having to list every possible HTML element
out there. Using this new setup, one can just create a whitelist of the
<td> and <th> elements.
This commit is contained in:
Yorick Peterse 2015-05-18 00:32:29 +02:00
parent 5a74571536
commit 688a1fff0e
1 changed files with 31 additions and 32 deletions

View File

@ -45,39 +45,38 @@ module Oga
HTML_SCRIPT = 'script'.freeze HTML_SCRIPT = 'script'.freeze
HTML_STYLE = 'style'.freeze HTML_STYLE = 'style'.freeze
# Elements that are allowed directly in a <table> element.
HTML_TABLE_ALLOWED = Whitelist.new(
%w{thead tbody tfoot tr caption colgroup col}
)
# Elements that should be closed automatically before a new opening tag is # Elements that should be closed automatically before a new opening tag is
# processed. # processed.
HTML_CLOSE_SELF = { HTML_CLOSE_SELF = {
'html' => NodeNameSet.new(%w{html}), 'head' => Blacklist.new(%w{head body}),
'head' => NodeNameSet.new(%w{head body}), 'body' => Blacklist.new(%w{head body}),
'body' => NodeNameSet.new(%w{body head}), 'li' => Blacklist.new(%w{li}),
'base' => NodeNameSet.new(%w{base}), 'dt' => Blacklist.new(%w{dt dd}),
'link' => NodeNameSet.new(%w{link}), 'dd' => Blacklist.new(%w{dt dd}),
'meta' => NodeNameSet.new(%w{meta}), 'p' => Blacklist.new(%w{
'noscript' => NodeNameSet.new(%w{noscript}),
'template' => NodeNameSet.new(%w{template}),
'title' => NodeNameSet.new(%w{title}),
'li' => NodeNameSet.new(%w{li}),
'dt' => NodeNameSet.new(%w{dt dd}),
'dd' => NodeNameSet.new(%w{dd dt}),
'rb' => NodeNameSet.new(%w{rb rt rtc rp}),
'rt' => NodeNameSet.new(%w{rb rt rtc rp}),
'rtc' => NodeNameSet.new(%w{rb rtc rp}),
'rp' => NodeNameSet.new(%w{rb rt rtc rp}),
'optgroup' => NodeNameSet.new(%w{optgroup}),
'option' => NodeNameSet.new(%w{option optgroup}),
'colgroup' => NodeNameSet.new(%w{thead tbody tfoot colgroup tr}),
'caption' => NodeNameSet.new(%w{thead tbody tfoot colgroup tr caption}),
'thead' => NodeNameSet.new(%w{thead tbody tfoot}),
'tbody' => NodeNameSet.new(%w{thead tbody tfoot}),
'tfoot' => NodeNameSet.new(%w{thead tbody tfoot}),
'tr' => NodeNameSet.new(%w{tr tbody thead tfoot}),
'td' => NodeNameSet.new(%w{td th tbody thead tfoot tr}),
'th' => NodeNameSet.new(%w{td th tbody thead tfoot tr}),
'p' => NodeNameSet.new(%w{
address article aside blockquote div dl fieldset footer form h1 h2 h3 address article aside blockquote div dl fieldset footer form h1 h2 h3
h4 h5 h6 header hgroup hr main nav ol p pre section table ul h4 h5 h6 header hgroup hr main nav ol p pre section table ul
}) }),
'rb' => Blacklist.new(%w{rb rt rtc rp}),
'rt' => Blacklist.new(%w{rb rt rtc rp}),
'rtc' => Blacklist.new(%w{rb rtc}),
'rp' => Blacklist.new(%w{rb rt rtc rp}),
'optgroup' => Blacklist.new(%w{optgroup}),
'option' => Blacklist.new(%w{optgroup option}),
'colgroup' => Whitelist.new(%w{col template}),
'caption' => HTML_TABLE_ALLOWED.to_blacklist,
'table' => HTML_TABLE_ALLOWED,
'thead' => Whitelist.new(%w{tr}),
'tbody' => Whitelist.new(%w{tr}),
'tfoot' => Whitelist.new(%w{tr}),
'tr' => Whitelist.new(%w{td th}),
'td' => Blacklist.new(%w{td th}) + HTML_TABLE_ALLOWED,
'th' => Blacklist.new(%w{td th}) + HTML_TABLE_ALLOWED
} }
HTML_CLOSE_SELF.keys.each do |key| HTML_CLOSE_SELF.keys.each do |key|
@ -424,7 +423,7 @@ module Oga
def before_html_element_name(name) def before_html_element_name(name)
close_current = HTML_CLOSE_SELF[current_element] close_current = HTML_CLOSE_SELF[current_element]
if close_current and close_current.include?(name) if close_current and !close_current.allow?(name)
on_element_end on_element_end
end end
@ -432,10 +431,10 @@ module Oga
# "<tbody>" not only closes an unclosed "<th>" but also the surrounding, # "<tbody>" not only closes an unclosed "<th>" but also the surrounding,
# unclosed "<tr>". # unclosed "<tr>".
while close_current = HTML_CLOSE_SELF[current_element] while close_current = HTML_CLOSE_SELF[current_element]
if close_current.include?(name) if close_current.allow?(name)
on_element_end
else
break break
else
on_element_end
end end
end end
end end