Lex/parse doctype names separately.

This commit is contained in:
Yorick Peterse 2014-04-03 21:59:57 +02:00
parent 8185656c1e
commit 81b1155af3
6 changed files with 24 additions and 17 deletions

View File

@ -281,7 +281,7 @@ module Oga
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
# 3. Legacy doctypes
#
doctype_start = '<!DOCTYPE'i whitespace+ 'HTML'i;
doctype_start = '<!DOCTYPE'i whitespace+;
action start_doctype {
emit_buffer
@ -302,6 +302,8 @@ module Oga
# including it.
whitespace;
identifier => { emit(:T_DOCTYPE_NAME) };
'>' => {
add_token(:T_DOCTYPE_END)
fret;

View File

@ -10,7 +10,7 @@
class Oga::XML::Parser
token T_STRING T_TEXT
token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE
token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME
token T_CDATA_START T_CDATA_END
token T_COMMENT_START T_COMMENT_END
token T_ELEM_START T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR
@ -43,25 +43,25 @@ rule
doctype
# <!DOCTYPE html>
: T_DOCTYPE_START T_DOCTYPE_END { s(:doctype) }
: T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_END { s(:doctype, val[1]) }
# <!DOCTYPE html PUBLIC>
| T_DOCTYPE_START T_DOCTYPE_TYPE T_DOCTYPE_END
{
s(:doctype, val[1])
}
# <!DOCTYPE html PUBLIC "foo">
| T_DOCTYPE_START T_DOCTYPE_TYPE T_STRING T_DOCTYPE_END
| T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_DOCTYPE_END
{
s(:doctype, val[1], val[2])
}
# <!DOCTYPE html PUBLIC "foo" "bar">
| T_DOCTYPE_START T_DOCTYPE_TYPE T_STRING T_STRING T_DOCTYPE_END
# <!DOCTYPE html PUBLIC "foo">
| T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_STRING T_DOCTYPE_END
{
s(:doctype, val[1], val[2], val[3])
}
# <!DOCTYPE html PUBLIC "foo" "bar">
| T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_STRING T_STRING T_DOCTYPE_END
{
s(:doctype, val[1], val[2], val[3], val[4])
}
;
# CDATA tags

View File

@ -5,6 +5,7 @@ describe Oga::XML::Lexer do
example 'lex the HTML5 doctype' do
lex('<!DOCTYPE html>').should == [
[:T_DOCTYPE_START, nil, 1],
[:T_DOCTYPE_NAME, 'html', 1],
[:T_DOCTYPE_END, nil, 1]
]
end
@ -12,6 +13,7 @@ describe Oga::XML::Lexer do
example 'lex a doctype with a public and system ID' do
lex('<!DOCTYPE HTML PUBLIC "foobar" "baz">').should == [
[:T_DOCTYPE_START, nil, 1],
[:T_DOCTYPE_NAME, 'HTML', 1],
[:T_DOCTYPE_TYPE, 'PUBLIC', 1],
[:T_STRING, 'foobar', 1],
[:T_STRING, 'baz', 1],
@ -22,6 +24,7 @@ describe Oga::XML::Lexer do
example 'lex a doctype with a public and system ID using single quotes' do
lex("<!DOCTYPE HTML PUBLIC 'foobar' 'baz'>").should == [
[:T_DOCTYPE_START, nil, 1],
[:T_DOCTYPE_NAME, 'HTML', 1],
[:T_DOCTYPE_TYPE, 'PUBLIC', 1],
[:T_STRING, 'foobar', 1],
[:T_STRING, 'baz', 1],

View File

@ -15,6 +15,7 @@ describe Oga::XML::Lexer do
lex(html).should == [
[:T_DOCTYPE_START, nil, 1],
[:T_DOCTYPE_NAME, 'html', 1],
[:T_DOCTYPE_END, nil, 1],
[:T_TEXT, "\n", 1],

View File

@ -3,27 +3,27 @@ require 'spec_helper'
describe Oga::XML::Parser do
context 'doctypes' do
example 'parse a doctype' do
parse('<!DOCTYPE html>').should == s(:document, s(:doctype))
parse('<!DOCTYPE html>').should == s(:document, s(:doctype, 'html'))
end
example 'parse a doctype with the doctype type' do
parse('<!DOCTYPE html PUBLIC>').should == s(
:document,
s(:doctype, 'PUBLIC')
s(:doctype, 'html', 'PUBLIC')
)
end
example 'parse a doctype with a public ID' do
parse('<!DOCTYPE html PUBLIC "foo">').should == s(
:document,
s(:doctype, 'PUBLIC', 'foo')
s(:doctype, 'html', 'PUBLIC', 'foo')
)
end
example 'parse a doctype with a public and private ID' do
parse('<!DOCTYPE html PUBLIC "foo" "bar">').should == s(
:document,
s(:doctype, 'PUBLIC', 'foo', 'bar')
s(:doctype, 'html', 'PUBLIC', 'foo', 'bar')
)
end
@ -35,6 +35,7 @@ describe Oga::XML::Parser do
:document,
s(
:doctype,
'HTML',
'PUBLIC',
'-//W3C//DTD HTML 4.01//EN',
'http://www.w3.org/TR/html4/strict.dtd'

View File

@ -15,7 +15,7 @@ describe Oga::XML::Parser do
parse(html).should == s(
:document,
s(:doctype),
s(:doctype, 'html'),
s(:text, "\n"),
# <html>