Lex/parse doctype names separately.

This commit is contained in:
Yorick Peterse 2014-04-03 21:59:57 +02:00
parent 8185656c1e
commit 81b1155af3
6 changed files with 24 additions and 17 deletions

View File

@ -281,7 +281,7 @@ module Oga
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5. # 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
# 3. Legacy doctypes # 3. Legacy doctypes
# #
doctype_start = '<!DOCTYPE'i whitespace+ 'HTML'i; doctype_start = '<!DOCTYPE'i whitespace+;
action start_doctype { action start_doctype {
emit_buffer emit_buffer
@ -302,6 +302,8 @@ module Oga
# including it. # including it.
whitespace; whitespace;
identifier => { emit(:T_DOCTYPE_NAME) };
'>' => { '>' => {
add_token(:T_DOCTYPE_END) add_token(:T_DOCTYPE_END)
fret; fret;

View File

@ -10,7 +10,7 @@
class Oga::XML::Parser class Oga::XML::Parser
token T_STRING T_TEXT token T_STRING T_TEXT
token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME
token T_CDATA_START T_CDATA_END token T_CDATA_START T_CDATA_END
token T_COMMENT_START T_COMMENT_END token T_COMMENT_START T_COMMENT_END
token T_ELEM_START T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR token T_ELEM_START T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR
@ -43,25 +43,25 @@ rule
doctype doctype
# <!DOCTYPE html> # <!DOCTYPE html>
: T_DOCTYPE_START T_DOCTYPE_END { s(:doctype) } : T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_END { s(:doctype, val[1]) }
# <!DOCTYPE html PUBLIC> # <!DOCTYPE html PUBLIC>
| T_DOCTYPE_START T_DOCTYPE_TYPE T_DOCTYPE_END | T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_DOCTYPE_END
{
s(:doctype, val[1])
}
# <!DOCTYPE html PUBLIC "foo">
| T_DOCTYPE_START T_DOCTYPE_TYPE T_STRING T_DOCTYPE_END
{ {
s(:doctype, val[1], val[2]) s(:doctype, val[1], val[2])
} }
# <!DOCTYPE html PUBLIC "foo" "bar"> # <!DOCTYPE html PUBLIC "foo">
| T_DOCTYPE_START T_DOCTYPE_TYPE T_STRING T_STRING T_DOCTYPE_END | T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_STRING T_DOCTYPE_END
{ {
s(:doctype, val[1], val[2], val[3]) s(:doctype, val[1], val[2], val[3])
} }
# <!DOCTYPE html PUBLIC "foo" "bar">
| T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_STRING T_STRING T_DOCTYPE_END
{
s(:doctype, val[1], val[2], val[3], val[4])
}
; ;
# CDATA tags # CDATA tags

View File

@ -5,6 +5,7 @@ describe Oga::XML::Lexer do
example 'lex the HTML5 doctype' do example 'lex the HTML5 doctype' do
lex('<!DOCTYPE html>').should == [ lex('<!DOCTYPE html>').should == [
[:T_DOCTYPE_START, nil, 1], [:T_DOCTYPE_START, nil, 1],
[:T_DOCTYPE_NAME, 'html', 1],
[:T_DOCTYPE_END, nil, 1] [:T_DOCTYPE_END, nil, 1]
] ]
end end
@ -12,6 +13,7 @@ describe Oga::XML::Lexer do
example 'lex a doctype with a public and system ID' do example 'lex a doctype with a public and system ID' do
lex('<!DOCTYPE HTML PUBLIC "foobar" "baz">').should == [ lex('<!DOCTYPE HTML PUBLIC "foobar" "baz">').should == [
[:T_DOCTYPE_START, nil, 1], [:T_DOCTYPE_START, nil, 1],
[:T_DOCTYPE_NAME, 'HTML', 1],
[:T_DOCTYPE_TYPE, 'PUBLIC', 1], [:T_DOCTYPE_TYPE, 'PUBLIC', 1],
[:T_STRING, 'foobar', 1], [:T_STRING, 'foobar', 1],
[:T_STRING, 'baz', 1], [:T_STRING, 'baz', 1],
@ -22,6 +24,7 @@ describe Oga::XML::Lexer do
example 'lex a doctype with a public and system ID using single quotes' do example 'lex a doctype with a public and system ID using single quotes' do
lex("<!DOCTYPE HTML PUBLIC 'foobar' 'baz'>").should == [ lex("<!DOCTYPE HTML PUBLIC 'foobar' 'baz'>").should == [
[:T_DOCTYPE_START, nil, 1], [:T_DOCTYPE_START, nil, 1],
[:T_DOCTYPE_NAME, 'HTML', 1],
[:T_DOCTYPE_TYPE, 'PUBLIC', 1], [:T_DOCTYPE_TYPE, 'PUBLIC', 1],
[:T_STRING, 'foobar', 1], [:T_STRING, 'foobar', 1],
[:T_STRING, 'baz', 1], [:T_STRING, 'baz', 1],

View File

@ -15,6 +15,7 @@ describe Oga::XML::Lexer do
lex(html).should == [ lex(html).should == [
[:T_DOCTYPE_START, nil, 1], [:T_DOCTYPE_START, nil, 1],
[:T_DOCTYPE_NAME, 'html', 1],
[:T_DOCTYPE_END, nil, 1], [:T_DOCTYPE_END, nil, 1],
[:T_TEXT, "\n", 1], [:T_TEXT, "\n", 1],

View File

@ -3,27 +3,27 @@ require 'spec_helper'
describe Oga::XML::Parser do describe Oga::XML::Parser do
context 'doctypes' do context 'doctypes' do
example 'parse a doctype' do example 'parse a doctype' do
parse('<!DOCTYPE html>').should == s(:document, s(:doctype)) parse('<!DOCTYPE html>').should == s(:document, s(:doctype, 'html'))
end end
example 'parse a doctype with the doctype type' do example 'parse a doctype with the doctype type' do
parse('<!DOCTYPE html PUBLIC>').should == s( parse('<!DOCTYPE html PUBLIC>').should == s(
:document, :document,
s(:doctype, 'PUBLIC') s(:doctype, 'html', 'PUBLIC')
) )
end end
example 'parse a doctype with a public ID' do example 'parse a doctype with a public ID' do
parse('<!DOCTYPE html PUBLIC "foo">').should == s( parse('<!DOCTYPE html PUBLIC "foo">').should == s(
:document, :document,
s(:doctype, 'PUBLIC', 'foo') s(:doctype, 'html', 'PUBLIC', 'foo')
) )
end end
example 'parse a doctype with a public and private ID' do example 'parse a doctype with a public and private ID' do
parse('<!DOCTYPE html PUBLIC "foo" "bar">').should == s( parse('<!DOCTYPE html PUBLIC "foo" "bar">').should == s(
:document, :document,
s(:doctype, 'PUBLIC', 'foo', 'bar') s(:doctype, 'html', 'PUBLIC', 'foo', 'bar')
) )
end end
@ -35,6 +35,7 @@ describe Oga::XML::Parser do
:document, :document,
s( s(
:doctype, :doctype,
'HTML',
'PUBLIC', 'PUBLIC',
'-//W3C//DTD HTML 4.01//EN', '-//W3C//DTD HTML 4.01//EN',
'http://www.w3.org/TR/html4/strict.dtd' 'http://www.w3.org/TR/html4/strict.dtd'

View File

@ -15,7 +15,7 @@ describe Oga::XML::Parser do
parse(html).should == s( parse(html).should == s(
:document, :document,
s(:doctype), s(:doctype, 'html'),
s(:text, "\n"), s(:text, "\n"),
# <html> # <html>