Ignore nested element start tags

This ensures that Oga is able to tokenize input such as the following:

    <script<script>foo</script>

Oga will now treat this as:

    <script>foo</script>

This is based on libxml's behaviour, which differs a bit from that of
Chromium: Chromium treats the node as a text node. Matching Chromium's
behaviour, however, would require complex look-ahead logic (as far as I
can tell) that I really don't want to implement in Oga.

Fixes #186
This commit is contained in:
Yorick Peterse 2017-12-28 16:12:20 +01:00
parent 1e002de527
commit f574197ea6
No known key found for this signature in database
GPG Key ID: EDD30D2BEB691AC9
2 changed files with 20 additions and 0 deletions

View File

@ -389,6 +389,7 @@
# Opening of an element: '<' immediately followed by ident_char.
element_start = '<' ident_char;
# Opening of a closing tag.
element_end = '</';
# An element opener in full: '<', an identifier, and optionally ':' plus a
# second identifier (presumably the namespace:name form; confirm against the
# definition of `identifier`). The element head machines list this pattern
# without an action, so input such as the second "<script" in
# "<script<script>" is consumed without emitting a token.
element_start_pattern = '<' identifier (':' identifier)?;
# Machine used for lexing the name/namespace of an element.
element_name := |*
@ -551,6 +552,7 @@
# Machine used for processing the contents of an XML element's starting tag.
element_head := |*
newline => advance_newline;
element_start_pattern;
# Attribute names and namespaces.
identifier ':' => {
@ -578,6 +580,7 @@
# tag.
html_element_head := |*
newline => advance_newline;
element_start_pattern;
html_identifier => {
callback(id_on_attribute, data, encoding, ts, te);

View File

@ -344,4 +344,21 @@ describe Oga::XML::Lexer do
[:T_ELEM_END, nil, 1]
])
end
# A second "<script" appearing inside an unfinished start tag is dropped, so
# the input lexes exactly like "<script>foo</script>".
it 'lexes an element with a nested opening tag' do
  tokens = lex('<script<script>foo</script>')

  expect(tokens).to eq(
    [
      [:T_ELEM_NAME, 'script', 1],
      [:T_TEXT, 'foo', 1],
      [:T_ELEM_END, nil, 1]
    ]
  )
end
# The nested "<script" is dropped, while the attribute that follows it is
# still reported as belonging to the outer element.
it 'lexes an element with a nested opening tag followed by an attribute' do
  tokens = lex('<script<script foo>foo</script>')

  expect(tokens).to eq(
    [
      [:T_ELEM_NAME, 'script', 1],
      [:T_ATTR, 'foo', 1],
      [:T_TEXT, 'foo', 1],
      [:T_ELEM_END, nil, 1]
    ]
  )
end
end