Use blacklists/whitelists for HTML closing rules
This allows for more fine-grained control over when to close certain elements. For example, an unclosed <tr> element should be closed automatically when the lexer bumps into any element other than <td> or <th>. Using the old NodeNameSet, this would mean having to list every possible HTML element out there. Using this new setup, one can just create a whitelist of the <td> and <th> elements.
This commit is contained in:
parent
5a74571536
commit
688a1fff0e
|
@ -45,39 +45,38 @@ module Oga
|
||||||
HTML_SCRIPT = 'script'.freeze
|
HTML_SCRIPT = 'script'.freeze
|
||||||
HTML_STYLE = 'style'.freeze
|
HTML_STYLE = 'style'.freeze
|
||||||
|
|
||||||
|
# Elements that are allowed directly in a <table> element.
|
||||||
|
HTML_TABLE_ALLOWED = Whitelist.new(
|
||||||
|
%w{thead tbody tfoot tr caption colgroup col}
|
||||||
|
)
|
||||||
|
|
||||||
# Elements that should be closed automatically before a new opening tag is
|
# Elements that should be closed automatically before a new opening tag is
|
||||||
# processed.
|
# processed.
|
||||||
HTML_CLOSE_SELF = {
|
HTML_CLOSE_SELF = {
|
||||||
'html' => NodeNameSet.new(%w{html}),
|
'head' => Blacklist.new(%w{head body}),
|
||||||
'head' => NodeNameSet.new(%w{head body}),
|
'body' => Blacklist.new(%w{head body}),
|
||||||
'body' => NodeNameSet.new(%w{body head}),
|
'li' => Blacklist.new(%w{li}),
|
||||||
'base' => NodeNameSet.new(%w{base}),
|
'dt' => Blacklist.new(%w{dt dd}),
|
||||||
'link' => NodeNameSet.new(%w{link}),
|
'dd' => Blacklist.new(%w{dt dd}),
|
||||||
'meta' => NodeNameSet.new(%w{meta}),
|
'p' => Blacklist.new(%w{
|
||||||
'noscript' => NodeNameSet.new(%w{noscript}),
|
|
||||||
'template' => NodeNameSet.new(%w{template}),
|
|
||||||
'title' => NodeNameSet.new(%w{title}),
|
|
||||||
'li' => NodeNameSet.new(%w{li}),
|
|
||||||
'dt' => NodeNameSet.new(%w{dt dd}),
|
|
||||||
'dd' => NodeNameSet.new(%w{dd dt}),
|
|
||||||
'rb' => NodeNameSet.new(%w{rb rt rtc rp}),
|
|
||||||
'rt' => NodeNameSet.new(%w{rb rt rtc rp}),
|
|
||||||
'rtc' => NodeNameSet.new(%w{rb rtc rp}),
|
|
||||||
'rp' => NodeNameSet.new(%w{rb rt rtc rp}),
|
|
||||||
'optgroup' => NodeNameSet.new(%w{optgroup}),
|
|
||||||
'option' => NodeNameSet.new(%w{option optgroup}),
|
|
||||||
'colgroup' => NodeNameSet.new(%w{thead tbody tfoot colgroup tr}),
|
|
||||||
'caption' => NodeNameSet.new(%w{thead tbody tfoot colgroup tr caption}),
|
|
||||||
'thead' => NodeNameSet.new(%w{thead tbody tfoot}),
|
|
||||||
'tbody' => NodeNameSet.new(%w{thead tbody tfoot}),
|
|
||||||
'tfoot' => NodeNameSet.new(%w{thead tbody tfoot}),
|
|
||||||
'tr' => NodeNameSet.new(%w{tr tbody thead tfoot}),
|
|
||||||
'td' => NodeNameSet.new(%w{td th tbody thead tfoot tr}),
|
|
||||||
'th' => NodeNameSet.new(%w{td th tbody thead tfoot tr}),
|
|
||||||
'p' => NodeNameSet.new(%w{
|
|
||||||
address article aside blockquote div dl fieldset footer form h1 h2 h3
|
address article aside blockquote div dl fieldset footer form h1 h2 h3
|
||||||
h4 h5 h6 header hgroup hr main nav ol p pre section table ul
|
h4 h5 h6 header hgroup hr main nav ol p pre section table ul
|
||||||
})
|
}),
|
||||||
|
'rb' => Blacklist.new(%w{rb rt rtc rp}),
|
||||||
|
'rt' => Blacklist.new(%w{rb rt rtc rp}),
|
||||||
|
'rtc' => Blacklist.new(%w{rb rtc}),
|
||||||
|
'rp' => Blacklist.new(%w{rb rt rtc rp}),
|
||||||
|
'optgroup' => Blacklist.new(%w{optgroup}),
|
||||||
|
'option' => Blacklist.new(%w{optgroup option}),
|
||||||
|
'colgroup' => Whitelist.new(%w{col template}),
|
||||||
|
'caption' => HTML_TABLE_ALLOWED.to_blacklist,
|
||||||
|
'table' => HTML_TABLE_ALLOWED,
|
||||||
|
'thead' => Whitelist.new(%w{tr}),
|
||||||
|
'tbody' => Whitelist.new(%w{tr}),
|
||||||
|
'tfoot' => Whitelist.new(%w{tr}),
|
||||||
|
'tr' => Whitelist.new(%w{td th}),
|
||||||
|
'td' => Blacklist.new(%w{td th}) + HTML_TABLE_ALLOWED,
|
||||||
|
'th' => Blacklist.new(%w{td th}) + HTML_TABLE_ALLOWED
|
||||||
}
|
}
|
||||||
|
|
||||||
HTML_CLOSE_SELF.keys.each do |key|
|
HTML_CLOSE_SELF.keys.each do |key|
|
||||||
|
@ -424,7 +423,7 @@ module Oga
|
||||||
def before_html_element_name(name)
|
def before_html_element_name(name)
|
||||||
close_current = HTML_CLOSE_SELF[current_element]
|
close_current = HTML_CLOSE_SELF[current_element]
|
||||||
|
|
||||||
if close_current and close_current.include?(name)
|
if close_current and !close_current.allow?(name)
|
||||||
on_element_end
|
on_element_end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -432,10 +431,10 @@ module Oga
|
||||||
# "<tbody>" not only closes an unclosed "<th>" but also the surrounding,
|
# "<tbody>" not only closes an unclosed "<th>" but also the surrounding,
|
||||||
# unclosed "<tr>".
|
# unclosed "<tr>".
|
||||||
while close_current = HTML_CLOSE_SELF[current_element]
|
while close_current = HTML_CLOSE_SELF[current_element]
|
||||||
if close_current.include?(name)
|
if close_current.allow?(name)
|
||||||
on_element_end
|
|
||||||
else
|
|
||||||
break
|
break
|
||||||
|
else
|
||||||
|
on_element_end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue