Correct closing of unclosed, nested HTML elements
Previously, HTML such as this would be lexed incorrectly: `<div> <ul> <li>foo </ul> inside div </div> outside div`. The lexer would see this as the following instead: `<div> <ul> <li>foo</li> inside div </ul> outside div </div>`. This commit exposes the name of the closing tag to XML::Lexer#on_element_end (omitted for self-closing tags). This can be used to automatically close nested tags that were left open, ensuring the above HTML is lexed correctly. The new setup ignores namespace prefixes as these are not used in HTML; XML, in turn, won't even run the code to begin with since it doesn't allow one to leave out closing tags.
This commit is contained in:
parent
8172de192c
commit
5182d0c488
|
@ -363,16 +363,15 @@
|
||||||
# body of an element is lexed using the `main` machine.
|
# body of an element is lexed using the `main` machine.
|
||||||
#
|
#
|
||||||
|
|
||||||
element_start = '<' ident_char;
|
|
||||||
element_end = '</' identifier (':' identifier)* '>';
|
|
||||||
|
|
||||||
action start_element {
|
action start_element {
|
||||||
fhold;
|
fhold;
|
||||||
fnext element_name;
|
fnext element_name;
|
||||||
}
|
}
|
||||||
|
|
||||||
action close_element {
|
action close_element {
|
||||||
callback_simple(id_on_element_end);
|
callback(id_on_element_end, data, encoding, mark, te - 1);
|
||||||
|
|
||||||
|
mark = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
action close_element_fnext_main {
|
action close_element_fnext_main {
|
||||||
|
@ -381,6 +380,12 @@
|
||||||
fnext main;
|
fnext main;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
element_start = '<' ident_char;
|
||||||
|
|
||||||
|
element_end = '</' %{ mark = p; } identifier '>'
|
||||||
|
| '</' identifier ':' %{ mark = p; } identifier '>'
|
||||||
|
;
|
||||||
|
|
||||||
# Machine used for lexing the name/namespace of an element.
|
# Machine used for lexing the name/namespace of an element.
|
||||||
element_name := |*
|
element_name := |*
|
||||||
identifier ':' => {
|
identifier ':' => {
|
||||||
|
|
|
@ -476,9 +476,19 @@ module Oga
|
||||||
##
|
##
|
||||||
# Called on the closing tag of an element.
|
# Called on the closing tag of an element.
|
||||||
#
|
#
|
||||||
def on_element_end
|
# @param [String] name The name of the element (minus namespace
|
||||||
|
# prefix). This is not set for self closing tags.
|
||||||
|
#
|
||||||
|
def on_element_end(name = nil)
|
||||||
return if @elements.empty?
|
return if @elements.empty?
|
||||||
|
|
||||||
|
if html? and name and @elements.include?(name)
|
||||||
|
while current_element != name
|
||||||
|
add_token(:T_ELEM_END)
|
||||||
|
@elements.pop
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
add_token(:T_ELEM_END)
|
add_token(:T_ELEM_END)
|
||||||
|
|
||||||
@elements.pop
|
@elements.pop
|
||||||
|
|
|
@ -0,0 +1,13 @@
|
||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
describe Oga::XML::Lexer do
|
||||||
|
describe 'closing HTML elements with mismatched closing tags' do
|
||||||
|
it 'lexes a <p> element closed using a </div> element' do
|
||||||
|
lex_html('<p>foo</div>').should == [
|
||||||
|
[:T_ELEM_NAME, 'p', 1],
|
||||||
|
[:T_TEXT, 'foo', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,49 @@
|
||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
describe Oga::XML::Lexer do
|
||||||
|
describe 'using HTML <ul> elements' do
|
||||||
|
it 'lexes an <ul> element containing unclosed <li> elements with text' do
|
||||||
|
lex_html('<ul><li>foo<li>bar</ul>').should == [
|
||||||
|
[:T_ELEM_NAME, 'ul', 1],
|
||||||
|
[:T_ELEM_NAME, 'li', 1],
|
||||||
|
[:T_TEXT, 'foo', 1],
|
||||||
|
[:T_ELEM_END, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'li', 1],
|
||||||
|
[:T_TEXT, 'bar', 1],
|
||||||
|
[:T_ELEM_END, nil, 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'lexes an <ul> element followed by text containing unclosed <li> elements with text' do
|
||||||
|
lex_html('<ul><li>foo<li>bar</ul>outside ul').should == [
|
||||||
|
[:T_ELEM_NAME, 'ul', 1],
|
||||||
|
[:T_ELEM_NAME, 'li', 1],
|
||||||
|
[:T_TEXT, 'foo', 1],
|
||||||
|
[:T_ELEM_END, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'li', 1],
|
||||||
|
[:T_TEXT, 'bar', 1],
|
||||||
|
[:T_ELEM_END, nil, 1],
|
||||||
|
[:T_ELEM_END, nil, 1],
|
||||||
|
[:T_TEXT, 'outside ul', 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'lexes nested <ul> elements containing unclosed <li> elements' do
|
||||||
|
lex_html('<ul><li><ul><li>foo</ul><li>bar</ul>').should == [
|
||||||
|
[:T_ELEM_NAME, 'ul', 1],
|
||||||
|
[:T_ELEM_NAME, 'li', 1],
|
||||||
|
[:T_ELEM_NAME, 'ul', 1],
|
||||||
|
[:T_ELEM_NAME, 'li', 1],
|
||||||
|
[:T_TEXT, 'foo', 1],
|
||||||
|
[:T_ELEM_END, nil, 1],
|
||||||
|
[:T_ELEM_END, nil, 1],
|
||||||
|
[:T_ELEM_END, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'li', 1],
|
||||||
|
[:T_TEXT, 'bar', 1],
|
||||||
|
[:T_ELEM_END, nil, 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
Loading…
Reference in New Issue