Make PUBLIC/SYSTEM matching case-insensitive

Some websites may write these keywords in lowercase ("public" or "system")
in doctypes, or with completely messed up casing such as PuBlIc (unlikely,
but possible). With this change the lexer matches both keywords regardless
of the exact casing used.

This fixes https://gitlab.com/yorickpeterse/oga/issues/199
This commit is contained in:
Yorick Peterse 2020-01-08 03:23:46 +01:00
parent 10e9101c42
commit f00fa40e3a
No known key found for this signature in database
GPG Key ID: EDD30D2BEB691AC9
2 changed files with 28 additions and 2 deletions

View File

@ -289,7 +289,7 @@
# Machine for processing doctypes. Doctype values such as the public
# and system IDs are treated as T_STRING tokens.
doctype := |*
'PUBLIC' | 'SYSTEM' => {
'PUBLIC'i | 'SYSTEM'i => {
callback(id_on_doctype_type, data, encoding, ts, te);
};

View File

@ -108,7 +108,7 @@ describe Oga::XML::Lexer do
# Technically not valid, put in place to make sure that the Ragel rules are
# not too greedy.
it 'lexes an inline doftype followed by a system ID' do
it 'lexes an inline doctype followed by a system ID' do
expect(lex('<!DOCTYPE html [<!ELEMENT foo>] "foo">')).to eq([
[:T_DOCTYPE_START, nil, 1],
[:T_DOCTYPE_NAME, 'html', 1],
@ -119,5 +119,31 @@ describe Oga::XML::Lexer do
[:T_DOCTYPE_END, nil, 1]
])
end
it 'does not care about the casing when using a public doctype' do
  # Mixed-case DOCTYPE and PUBLIC keywords must still be recognised; the
  # token values keep the casing used in the input.
  tokens = lex('<!DoCtYpE HtMl PuBlIc [<!ELEMENT foo>] "foo">')

  expected = [
    [:T_DOCTYPE_START, nil, 1],
    [:T_DOCTYPE_NAME, 'HtMl', 1],
    [:T_DOCTYPE_TYPE, 'PuBlIc', 1],
    [:T_DOCTYPE_INLINE, '<!ELEMENT foo>', 1],
    [:T_STRING_DQUOTE, nil, 1],
    [:T_STRING_BODY, 'foo', 1],
    [:T_STRING_DQUOTE, nil, 1],
    [:T_DOCTYPE_END, nil, 1]
  ]

  expect(tokens).to eq(expected)
end
it 'does not care about the casing when using a system doctype' do
  # Mixed-case DOCTYPE and SYSTEM keywords must still be recognised; the
  # token values keep the casing used in the input.
  tokens = lex('<!DoCtYpE HtMl SyStEm [<!ELEMENT foo>] "foo">')

  expected = [
    [:T_DOCTYPE_START, nil, 1],
    [:T_DOCTYPE_NAME, 'HtMl', 1],
    [:T_DOCTYPE_TYPE, 'SyStEm', 1],
    [:T_DOCTYPE_INLINE, '<!ELEMENT foo>', 1],
    [:T_STRING_DQUOTE, nil, 1],
    [:T_STRING_BODY, 'foo', 1],
    [:T_STRING_DQUOTE, nil, 1],
    [:T_DOCTYPE_END, nil, 1]
  ]

  expect(tokens).to eq(expected)
end
end
end