Correct closing of unclosed, nested HTML elements

Previously, HTML such as this would be lexed incorrectly:

    <div>
        <ul>
            <li>foo
        </ul>
        inside div
    </div>
    outside div

The lexer would see this as the following instead:

    <div>
        <ul>
            <li>foo</li>
            inside div
        </ul>
    outside div
    </div>

This commit exposes the name of the closing tag to
XML::Lexer#on_element_end (omitted for self closing tags). This can be
used to automatically close nested tags that were left open, ensuring
the above HTML is lexed correctly.

The new setup ignores namespace prefixes as these are not used in HTML.
XML in turn won't even run this code to begin with, since it doesn't
allow one to leave out closing tags.
This commit is contained in:
Yorick Peterse 2015-05-23 09:59:50 +02:00
parent 8172de192c
commit 5182d0c488
4 changed files with 82 additions and 5 deletions

View File

@ -363,16 +363,15 @@
# body of an element is lexed using the `main` machine. # body of an element is lexed using the `main` machine.
# #
element_start = '<' ident_char;
element_end = '</' identifier (':' identifier)* '>';
action start_element { action start_element {
fhold; fhold;
fnext element_name; fnext element_name;
} }
action close_element { action close_element {
callback_simple(id_on_element_end); callback(id_on_element_end, data, encoding, mark, te - 1);
mark = 0;
} }
action close_element_fnext_main { action close_element_fnext_main {
@ -381,6 +380,12 @@
fnext main; fnext main;
} }
element_start = '<' ident_char;
element_end = '</' %{ mark = p; } identifier '>'
| '</' identifier ':' %{ mark = p; } identifier '>'
;
# Machine used for lexing the name/namespace of an element. # Machine used for lexing the name/namespace of an element.
element_name := |* element_name := |*
identifier ':' => { identifier ':' => {

View File

@ -476,9 +476,19 @@ module Oga
## ##
# Called on the closing tag of an element. # Called on the closing tag of an element.
# #
def on_element_end # @param [String] name The name of the element (minus namespace
# prefix). This is not set for self closing tags.
#
def on_element_end(name = nil)
return if @elements.empty? return if @elements.empty?
if html? and name and @elements.include?(name)
while current_element != name
add_token(:T_ELEM_END)
@elements.pop
end
end
add_token(:T_ELEM_END) add_token(:T_ELEM_END)
@elements.pop @elements.pop

View File

@ -0,0 +1,13 @@
require 'spec_helper'
# Lexer spec for HTML input in which an open element is followed by a closing
# tag that names a different element.
describe Oga::XML::Lexer do
describe 'closing HTML elements with mismatched closing tags' do
# The <p> is never explicitly closed; the only closing tag in the input is
# </div>. Exactly one T_ELEM_END token is expected after the text.
it 'lexes a <p> element closed using a </div> element' do
lex_html('<p>foo</div>').should == [
[:T_ELEM_NAME, 'p', 1],
[:T_TEXT, 'foo', 1],
[:T_ELEM_END, nil, 1]
]
end
end
end

View File

@ -0,0 +1,49 @@
require 'spec_helper'
# Lexer specs for HTML <ul> elements whose <li> children are left without
# explicit closing tags.
describe Oga::XML::Lexer do
describe 'using HTML <ul> elements' do
# Neither <li> has a closing tag. One T_ELEM_END is expected before the second
# <li> starts, and two T_ELEM_END tokens are expected once </ul> is reached.
it 'lexes an <ul> element containing unclosed <li> elements with text' do
lex_html('<ul><li>foo<li>bar</ul>').should == [
[:T_ELEM_NAME, 'ul', 1],
[:T_ELEM_NAME, 'li', 1],
[:T_TEXT, 'foo', 1],
[:T_ELEM_END, nil, 1],
[:T_ELEM_NAME, 'li', 1],
[:T_TEXT, 'bar', 1],
[:T_ELEM_END, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
# Same input as the previous example with trailing text added. The text token
# is expected only after both trailing T_ELEM_END tokens.
it 'lexes an <ul> element followed by text containing unclosed <li> elements with text' do
lex_html('<ul><li>foo<li>bar</ul>outside ul').should == [
[:T_ELEM_NAME, 'ul', 1],
[:T_ELEM_NAME, 'li', 1],
[:T_TEXT, 'foo', 1],
[:T_ELEM_END, nil, 1],
[:T_ELEM_NAME, 'li', 1],
[:T_TEXT, 'bar', 1],
[:T_ELEM_END, nil, 1],
[:T_ELEM_END, nil, 1],
[:T_TEXT, 'outside ul', 1]
]
end
# A <ul> nested inside an unclosed <li>. Three consecutive T_ELEM_END tokens
# are expected between the 'foo' text and the next <li>, and two more after
# the 'bar' text.
it 'lexes nested <ul> elements containing unclosed <li> elements' do
lex_html('<ul><li><ul><li>foo</ul><li>bar</ul>').should == [
[:T_ELEM_NAME, 'ul', 1],
[:T_ELEM_NAME, 'li', 1],
[:T_ELEM_NAME, 'ul', 1],
[:T_ELEM_NAME, 'li', 1],
[:T_TEXT, 'foo', 1],
[:T_ELEM_END, nil, 1],
[:T_ELEM_END, nil, 1],
[:T_ELEM_END, nil, 1],
[:T_ELEM_NAME, 'li', 1],
[:T_TEXT, 'bar', 1],
[:T_ELEM_END, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
end
end