From f574197ea657cf09405336ca618a22e32c94d0d0 Mon Sep 17 00:00:00 2001
From: Yorick Peterse <yorickpeterse@gmail.com>
Date: Thu, 28 Dec 2017 16:12:20 +0100
Subject: [PATCH] Ignore nested element start tags

This ensures that Oga is able to tokenize input such as the following:

    <script<script>foo</script>

Oga will now treat this as:

    <script>foo</script>

This is based on libxml behaviour, which seems to differ a bit from
Chromium, which treats the node as a text node. Mimicking Chromium would
however require complex look-ahead logic (as far as I can tell) that I
really don't want to implement in Oga.

Fixes #186
---
 ext/ragel/base_lexer.rl             |  3 +++
 spec/oga/xml/lexer/elements_spec.rb | 17 +++++++++++++++++
 2 files changed, 20 insertions(+)
diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl
index 0e552b4..f6a80a2 100644
--- a/ext/ragel/base_lexer.rl
+++ b/ext/ragel/base_lexer.rl
@@ -389,6 +389,7 @@
 
     element_start = '<' ident_char;
     element_end   = '</';
+    element_start_pattern = '<' identifier (':' identifier)?;
 
     # Machine used for lexing the name/namespace of an element.
     element_name := |*
@@ -551,6 +552,7 @@
     # Machine used for processing the contents of an XML element's starting tag.
     element_head := |*
         newline => advance_newline;
+        element_start_pattern;
 
         # Attribute names and namespaces.
         identifier ':' => {
@@ -578,6 +580,7 @@
     # tag.
     html_element_head := |*
         newline => advance_newline;
+        element_start_pattern;
 
         html_identifier => {
             callback(id_on_attribute, data, encoding, ts, te);
diff --git a/spec/oga/xml/lexer/elements_spec.rb b/spec/oga/xml/lexer/elements_spec.rb
index 7caefce..4832869 100644
--- a/spec/oga/xml/lexer/elements_spec.rb
+++ b/spec/oga/xml/lexer/elements_spec.rb
@@ -344,4 +344,21 @@ describe Oga::XML::Lexer do
       [:T_ELEM_END, nil, 1]
     ])
   end
+
+  it 'lexes an element with a nested opening tag' do
+    expect(lex('<script<script>foo</script>')).to eq([
+      [:T_ELEM_NAME, 'script', 1],
+      [:T_TEXT, 'foo', 1],
+      [:T_ELEM_END, nil, 1]
+    ])
+  end
+
+  it 'lexes an element with a nested opening tag followed by an attribute' do
+    expect(lex('<script<script foo>foo</script>')).to eq([
+      [:T_ELEM_NAME, 'script', 1],
+      [:T_ATTR, 'foo', 1],
+      [:T_TEXT, 'foo', 1],
+      [:T_ELEM_END, nil, 1]
+    ])
+  end
 end