Automatic closing of certain HTML tags

This ensures that HTML such as this:

    <li>foo
    <li>bar

is parsed as this:

    <li>foo</li>
    <li>bar</li>

and not as this:

    <li>
        foo
        <li>bar</li>
    </li>

Fixes #97
This commit is contained in:
Yorick Peterse 2015-04-22 01:29:28 +02:00
parent 4b21a2fadc
commit 4b1c296936
2 changed files with 81 additions and 7 deletions

View File

@ -40,18 +40,52 @@ module Oga
class Lexer class Lexer
attr_reader :html attr_reader :html
# @return [String] # These are all constant/frozen to remove the need for String allocations
HTML_SCRIPT = 'script' # every time they are referenced in the lexer.
HTML_SCRIPT = 'script'.freeze
HTML_STYLE = 'style'.freeze
# @return [String] # Elements that should be closed automatically before a new opening tag is
HTML_STYLE = 'style' # processed.
# Keyed by the name of the element that is currently open; each value is the
# set of element names which, when opened next, cause the open element to be
# closed first.
HTML_CLOSE_SELF = {
  'html'     => NodeNameSet.new(%w[html]),
  'head'     => NodeNameSet.new(%w[head]),
  'body'     => NodeNameSet.new(%w[body]),
  'base'     => NodeNameSet.new(%w[base]),
  'link'     => NodeNameSet.new(%w[link]),
  'meta'     => NodeNameSet.new(%w[meta]),
  'noscript' => NodeNameSet.new(%w[noscript]),
  'template' => NodeNameSet.new(%w[template]),
  'title'    => NodeNameSet.new(%w[title]),
  'li'       => NodeNameSet.new(%w[li]),
  'dt'       => NodeNameSet.new(%w[dt dd]),
  'dd'       => NodeNameSet.new(%w[dd dt]),
  'rb'       => NodeNameSet.new(%w[rb rt rtc rp]),
  'rt'       => NodeNameSet.new(%w[rb rt rtc rp]),
  'rtc'      => NodeNameSet.new(%w[rb rtc rp]),
  'rp'       => NodeNameSet.new(%w[rb rt rtc rp]),
  'optgroup' => NodeNameSet.new(%w[optgroup]),
  'option'   => NodeNameSet.new(%w[option optgroup]),
  'thead'    => NodeNameSet.new(%w[tbody tfoot]),
  'tbody'    => NodeNameSet.new(%w[tbody tfoot]),
  'tfoot'    => NodeNameSet.new(%w[tbody]),
  'tr'       => NodeNameSet.new(%w[tr]),
  'td'       => NodeNameSet.new(%w[td th]),
  'th'       => NodeNameSet.new(%w[td th]),
  'p'        => NodeNameSet.new(%w[
    address article aside blockquote div dl fieldset footer form h1 h2 h3
    h4 h5 h6 header hgroup hr main nav ol p pre section table ul
  ])
}

# Register an upper-case alias for every key, pointing at the very same set.
# The pairs are snapshotted with to_a first so the Hash is not modified while
# it is being iterated.
HTML_CLOSE_SELF.to_a.each do |tag_name, closers|
  HTML_CLOSE_SELF[tag_name.upcase] = closers
end
## ##
# Names of HTML tags of which the content should be lexed as-is. # Names of HTML tags of which the content should be lexed as-is.
# #
# @return [Array] LITERAL_HTML_ELEMENTS = NodeNameSet.new([HTML_SCRIPT, HTML_STYLE])
#
LITERAL_HTML_ELEMENTS = [HTML_SCRIPT, HTML_STYLE]
## ##
# @param [String|IO] data The data to lex. This can either be a String or # @param [String|IO] data The data to lex. This can either be a String or
@ -375,6 +409,28 @@ module Oga
# @param [String] name The name of the element, including namespace. # @param [String] name The name of the element, including namespace.
# #
def on_element_name(name)
  # In HTML mode, let before_html_element_name run first so it can close the
  # current element before the new one is registered.
  before_html_element_name(name) if html?

  add_element(name)
end
##
# Closes the current element when opening `name` requires it: the current
# element is looked up in HTML_CLOSE_SELF and, if the resulting set includes
# `name`, on_element_end is invoked before the new element is processed.
#
# @param [String] name The name of the element that is about to be opened.
#
def before_html_element_name(name)
  closers = HTML_CLOSE_SELF[current_element]

  # Nothing to do when the current element has no entry, or when the new
  # element is not one of those that close it.
  return unless closers && closers.include?(name)

  on_element_end
end
##
# @param [String] name
#
def add_element(name)
@elements << name @elements << name
add_token(:T_ELEM_NAME, name) add_token(:T_ELEM_NAME, name)

View File

@ -0,0 +1,18 @@
require 'spec_helper'
describe Oga::XML::Lexer do
  # One example is generated per (open element, closing element) pair found in
  # HTML_CLOSE_SELF, so every entry of the table is exercised.
  described_class::HTML_CLOSE_SELF.each do |opener, closers|
    describe "lexing <#{opener}> tags" do
      closers.each do |closer|
        it "automatically closes a <#{opener}> followed by a <#{closer}>" do
          # The first element must be ended before the second one starts,
          # instead of the second being nested inside the first.
          expected_tokens = [
            [:T_ELEM_NAME, opener, 1],
            [:T_ELEM_END, nil, 1],
            [:T_ELEM_NAME, closer, 1],
            [:T_ELEM_END, nil, 1]
          ]

          lex_html("<#{opener}><#{closer}>").should == expected_tokens
        end
      end
    end
  end
end