From f00fa40e3aae13861b50fd0f55e4f8488581b5c1 Mon Sep 17 00:00:00 2001
From: Yorick Peterse <yorick@yorickpeterse.com>
Date: Wed, 8 Jan 2020 03:23:46 +0100
Subject: [PATCH] Make PUBLIC/SYSTEM matching case-insensitive

Some websites write the PUBLIC/SYSTEM doctype keywords in lowercase
("public" or "system") or in mixed case such as PuBlIc (unlikely, but
possible). The lexer now matches both keywords regardless of casing.

This fixes https://gitlab.com/yorickpeterse/oga/issues/199
---
 ext/ragel/base_lexer.rl            |  2 +-
 spec/oga/xml/lexer/doctype_spec.rb | 28 +++++++++++++++++++++++++++-
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl
index f6a80a2..1aa09f5 100644
--- a/ext/ragel/base_lexer.rl
+++ b/ext/ragel/base_lexer.rl
@@ -289,7 +289,7 @@
     # Machine for processing doctypes. Doctype values such as the public
     # and system IDs are treated as T_STRING tokens.
     doctype := |*
-        'PUBLIC' | 'SYSTEM' => {
+        'PUBLIC'i | 'SYSTEM'i => {
             callback(id_on_doctype_type, data, encoding, ts, te);
         };
 
diff --git a/spec/oga/xml/lexer/doctype_spec.rb b/spec/oga/xml/lexer/doctype_spec.rb
index 7fbe8d1..14735ea 100644
--- a/spec/oga/xml/lexer/doctype_spec.rb
+++ b/spec/oga/xml/lexer/doctype_spec.rb
@@ -108,7 +108,7 @@ describe Oga::XML::Lexer do
 
     # Technically not valid, put in place to make sure that the Ragel rules are
     # not too greedy.
-    it 'lexes an inline doftype followed by a system ID' do
+    it 'lexes an inline doctype followed by a system ID' do
       expect(lex('<!DOCTYPE html [<!ELEMENT foo>] "foo">')).to eq([
         [:T_DOCTYPE_START, nil, 1],
         [:T_DOCTYPE_NAME, 'html', 1],
@@ -119,5 +119,31 @@ describe Oga::XML::Lexer do
         [:T_DOCTYPE_END, nil, 1]
       ])
     end
+
+    it 'does not care about the casing when using a public doctype' do
+      expect(lex('<!DoCtYpE HtMl PuBlIc [<!ELEMENT foo>] "foo">')).to eq([
+        [:T_DOCTYPE_START, nil, 1],
+        [:T_DOCTYPE_NAME, 'HtMl', 1],
+        [:T_DOCTYPE_TYPE, 'PuBlIc', 1],
+        [:T_DOCTYPE_INLINE, '<!ELEMENT foo>', 1],
+        [:T_STRING_DQUOTE, nil, 1],
+        [:T_STRING_BODY, 'foo', 1],
+        [:T_STRING_DQUOTE, nil, 1],
+        [:T_DOCTYPE_END, nil, 1]
+      ])
+    end
+
+    it 'does not care about the casing when using a system doctype' do
+      expect(lex('<!DoCtYpE HtMl SyStEm [<!ELEMENT foo>] "foo">')).to eq([
+        [:T_DOCTYPE_START, nil, 1],
+        [:T_DOCTYPE_NAME, 'HtMl', 1],
+        [:T_DOCTYPE_TYPE, 'SyStEm', 1],
+        [:T_DOCTYPE_INLINE, '<!ELEMENT foo>', 1],
+        [:T_STRING_DQUOTE, nil, 1],
+        [:T_STRING_BODY, 'foo', 1],
+        [:T_STRING_DQUOTE, nil, 1],
+        [:T_DOCTYPE_END, nil, 1]
+      ])
+    end
   end
 end