From f574197ea657cf09405336ca618a22e32c94d0d0 Mon Sep 17 00:00:00 2001
From: Yorick Peterse
Date: Thu, 28 Dec 2017 16:12:20 +0100
Subject: [PATCH] Ignore nested element start tags

This ensures that Oga is able to tokenize input such as the following:

    [example markup lost in extraction; only the text "foo" survives]

Oga will now treat this as:

    [example markup lost in extraction]

This is based on libxml behaviour, which seems to differ a bit from
Chromium, which treats the node as a text node. This, however, would
require complex look-ahead logic (as far as I can tell) that I really
don't want to implement in Oga.

Fixes #186
---
Note: angle-bracket markup was stripped when this patch was extracted.
Bracketed placeholders mark content that could not be recovered. The two
spec inputs below are reconstructed from their expected tokens; the name of
the nested tag is inferred and should be checked against the upstream commit.

 ext/ragel/base_lexer.rl             |  3 +++
 spec/oga/xml/lexer/elements_spec.rb | 17 +++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl
index 0e552b4..f6a80a2 100644
--- a/ext/ragel/base_lexer.rl
+++ b/ext/ragel/base_lexer.rl
@@ -389,6 +389,7 @@
     element_start = '<' ident_char;
     element_end   = '[... remainder of this hunk (including its added line) and the start of the next hunk lost in extraction ...] advance_newline;
+        element_start_pattern;
 
         # Attribute names and namespaces.
         identifier ':' => {
@@ -578,6 +580,7 @@
     # tag.
     html_element_head := |*
         newline => advance_newline;
+        element_start_pattern;
 
         html_identifier => {
             callback(id_on_attribute, data, encoding, ts, te);
diff --git a/spec/oga/xml/lexer/elements_spec.rb b/spec/oga/xml/lexer/elements_spec.rb
index 7caefce..4832869 100644
--- a/spec/oga/xml/lexer/elements_spec.rb
+++ b/spec/oga/xml/lexer/elements_spec.rb
@@ -344,4 +344,21 @@ describe Oga::XML::Lexer do
         [:T_ELEM_END, nil, 1]
       ])
     end
+
+    it 'lexes an element with a nested opening tag' do
+      expect(lex('<script<script>foo</script>')).to eq([
+        [:T_ELEM_NAME, 'script', 1],
+        [:T_TEXT, 'foo', 1],
+        [:T_ELEM_END, nil, 1]
+      ])
+    end
+
+    it 'lexes an element with a nested opening tag followed by an attribute' do
+      expect(lex('<script<script foo>foo</script>')).to eq([
+        [:T_ELEM_NAME, 'script', 1],
+        [:T_ATTR, 'foo', 1],
+        [:T_TEXT, 'foo', 1],
+        [:T_ELEM_END, nil, 1]
+      ])
+    end
   end