Automatic closing of certain HTML tags
This ensures that HTML such as this: <li>foo <li>bar is parsed as this: <li>foo</li> <li>bar</li>, and not as this: <li> foo <li>bar</li> </li>. Fixes #97.
This commit is contained in:
parent
4b21a2fadc
commit
4b1c296936
|
@ -40,18 +40,52 @@ module Oga
|
||||||
class Lexer
|
class Lexer
|
||||||
attr_reader :html
|
attr_reader :html
|
||||||
|
|
||||||
# @return [String]
|
# These are all constant/frozen to remove the need for String allocations
|
||||||
HTML_SCRIPT = 'script'
|
# every time they are referenced in the lexer.
|
||||||
|
HTML_SCRIPT = 'script'.freeze
|
||||||
|
HTML_STYLE = 'style'.freeze
|
||||||
|
|
||||||
# @return [String]
|
# Elements that should be closed automatically before a new opening tag is
|
||||||
HTML_STYLE = 'style'
|
# processed.
|
||||||
|
HTML_CLOSE_SELF = {
|
||||||
|
'html' => NodeNameSet.new(%w{html}),
|
||||||
|
'head' => NodeNameSet.new(%w{head}),
|
||||||
|
'body' => NodeNameSet.new(%w{body}),
|
||||||
|
'base' => NodeNameSet.new(%w{base}),
|
||||||
|
'link' => NodeNameSet.new(%w{link}),
|
||||||
|
'meta' => NodeNameSet.new(%w{meta}),
|
||||||
|
'noscript' => NodeNameSet.new(%w{noscript}),
|
||||||
|
'template' => NodeNameSet.new(%w{template}),
|
||||||
|
'title' => NodeNameSet.new(%w{title}),
|
||||||
|
'li' => NodeNameSet.new(%w{li}),
|
||||||
|
'dt' => NodeNameSet.new(%w{dt dd}),
|
||||||
|
'dd' => NodeNameSet.new(%w{dd dt}),
|
||||||
|
'rb' => NodeNameSet.new(%w{rb rt rtc rp}),
|
||||||
|
'rt' => NodeNameSet.new(%w{rb rt rtc rp}),
|
||||||
|
'rtc' => NodeNameSet.new(%w{rb rtc rp}),
|
||||||
|
'rp' => NodeNameSet.new(%w{rb rt rtc rp}),
|
||||||
|
'optgroup' => NodeNameSet.new(%w{optgroup}),
|
||||||
|
'option' => NodeNameSet.new(%w{option optgroup}),
|
||||||
|
'thead' => NodeNameSet.new(%w{tbody tfoot}),
|
||||||
|
'tbody' => NodeNameSet.new(%w{tbody tfoot}),
|
||||||
|
'tfoot' => NodeNameSet.new(%w{tbody}),
|
||||||
|
'tr' => NodeNameSet.new(%w{tr}),
|
||||||
|
'td' => NodeNameSet.new(%w{td th}),
|
||||||
|
'th' => NodeNameSet.new(%w{td th}),
|
||||||
|
'p' => NodeNameSet.new(%w{
|
||||||
|
address article aside blockquote div dl fieldset footer form h1 h2 h3
|
||||||
|
h4 h5 h6 header hgroup hr main nav ol p pre section table ul
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
HTML_CLOSE_SELF.keys.each do |key|
|
||||||
|
HTML_CLOSE_SELF[key.upcase] = HTML_CLOSE_SELF[key]
|
||||||
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
# Names of HTML tags of which the content should be lexed as-is.
|
# Names of HTML tags of which the content should be lexed as-is.
|
||||||
#
|
#
|
||||||
# @return [Array]
|
LITERAL_HTML_ELEMENTS = NodeNameSet.new([HTML_SCRIPT, HTML_STYLE])
|
||||||
#
|
|
||||||
LITERAL_HTML_ELEMENTS = [HTML_SCRIPT, HTML_STYLE]
|
|
||||||
|
|
||||||
##
|
##
|
||||||
# @param [String|IO] data The data to lex. This can either be a String or
|
# @param [String|IO] data The data to lex. This can either be a String or
|
||||||
|
@ -375,6 +409,28 @@ module Oga
|
||||||
# @param [String] name The name of the element, including namespace.
|
# @param [String] name The name of the element, including namespace.
|
||||||
#
|
#
|
||||||
def on_element_name(name)
|
def on_element_name(name)
|
||||||
|
before_html_element_name(name) if html?
|
||||||
|
|
||||||
|
add_element(name)
|
||||||
|
end
|
||||||
|
|
||||||
|
##
# Closes the currently open HTML element when the tag that is about to be
# opened implies that element's (omitted) closing tag, e.g. a `<li>` that
# directly follows an unclosed `<li>`.
#
# The lookup is driven by HTML_CLOSE_SELF, which maps an element name to the
# set of element names that terminate it.
#
# @param [String] name The name of the element that is about to be opened.
#
def before_html_element_name(name)
  # nil when the current element has no auto-close rules.
  # NOTE(review): presumably `current_element` is the name of the innermost
  # open element (or nil when there is none) - confirm against its definition.
  close_current = HTML_CLOSE_SELF[current_element]

  return unless close_current && close_current.include?(name)

  on_element_end
end
|
||||||
|
|
||||||
|
##
|
||||||
|
# @param [String] name
|
||||||
|
#
|
||||||
|
def add_element(name)
|
||||||
@elements << name
|
@elements << name
|
||||||
|
|
||||||
add_token(:T_ELEM_NAME, name)
|
add_token(:T_ELEM_NAME, name)
|
||||||
|
|
|
@ -0,0 +1,18 @@
|
||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
describe Oga::XML::Lexer do
  # Generate one example per (element, terminating tag) pair listed in
  # HTML_CLOSE_SELF: opening the terminating tag right after the element must
  # emit an end token for the element before the new element's name token.
  described_class::HTML_CLOSE_SELF.each do |tag, closers|
    describe "lexing <#{tag}> tags" do
      closers.each do |closer|
        it "automatically closes a <#{tag}> followed by a <#{closer}>" do
          expected = [
            [:T_ELEM_NAME, tag, 1],
            [:T_ELEM_END, nil, 1],
            [:T_ELEM_NAME, closer, 1],
            [:T_ELEM_END, nil, 1]
          ]

          lex_html("<#{tag}><#{closer}>").should == expected
        end
      end
    end
  end
end
|
Loading…
Reference in New Issue