From 81b1155af3199a92c4c239b16283fee766d5dda7 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Thu, 3 Apr 2014 21:59:57 +0200 Subject: [PATCH] Lex/parse doctype names separately. --- lib/oga/xml/lexer.rl | 4 +++- lib/oga/xml/parser.y | 22 +++++++++++----------- spec/oga/xml/lexer/doctype_spec.rb | 3 +++ spec/oga/xml/lexer/documents_spec.rb | 1 + spec/oga/xml/parser/doctype_spec.rb | 9 +++++---- spec/oga/xml/parser/documents_spec.rb | 2 +- 6 files changed, 24 insertions(+), 17 deletions(-) diff --git a/lib/oga/xml/lexer.rl b/lib/oga/xml/lexer.rl index 765e235..ad5e94a 100644 --- a/lib/oga/xml/lexer.rl +++ b/lib/oga/xml/lexer.rl @@ -281,7 +281,7 @@ module Oga # 2. Deprecated doctypes, the more verbose ones used prior to HTML5. # 3. Legacy doctypes # - doctype_start = ' { emit(:T_DOCTYPE_NAME) }; + '>' => { add_token(:T_DOCTYPE_END) fret; diff --git a/lib/oga/xml/parser.y b/lib/oga/xml/parser.y index 9a1aec8..0798d71 100644 --- a/lib/oga/xml/parser.y +++ b/lib/oga/xml/parser.y @@ -10,7 +10,7 @@ class Oga::XML::Parser token T_STRING T_TEXT -token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE +token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME token T_CDATA_START T_CDATA_END token T_COMMENT_START T_COMMENT_END token T_ELEM_START T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR @@ -43,25 +43,25 @@ rule doctype # - : T_DOCTYPE_START T_DOCTYPE_END { s(:doctype) } + : T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_END { s(:doctype, val[1]) } # - | T_DOCTYPE_START T_DOCTYPE_TYPE T_DOCTYPE_END - { - s(:doctype, val[1]) - } - - # - | T_DOCTYPE_START T_DOCTYPE_TYPE T_STRING T_DOCTYPE_END + | T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_DOCTYPE_END { s(:doctype, val[1], val[2]) } - # - | T_DOCTYPE_START T_DOCTYPE_TYPE T_STRING T_STRING T_DOCTYPE_END + # + | T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_STRING T_DOCTYPE_END { s(:doctype, val[1], val[2], val[3]) } + + # + | T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_STRING T_STRING T_DOCTYPE_END + { + s(:doctype, val[1], val[2], val[3], val[4]) + } ; # CDATA tags diff --git a/spec/oga/xml/lexer/doctype_spec.rb b/spec/oga/xml/lexer/doctype_spec.rb index 666743d..f4dc872 100644 --- a/spec/oga/xml/lexer/doctype_spec.rb +++ b/spec/oga/xml/lexer/doctype_spec.rb @@ -5,6 +5,7 @@ describe Oga::XML::Lexer do example 'lex the HTML5 doctype' do lex('').should == [ [:T_DOCTYPE_START, nil, 1], + [:T_DOCTYPE_NAME, 'html', 1], [:T_DOCTYPE_END, nil, 1] ] end @@ -12,6 +13,7 @@ describe Oga::XML::Lexer do example 'lex a doctype with a public and system ID' do lex('').should == [ [:T_DOCTYPE_START, nil, 1], + [:T_DOCTYPE_NAME, 'HTML', 1], [:T_DOCTYPE_TYPE, 'PUBLIC', 1], [:T_STRING, 'foobar', 1], [:T_STRING, 'baz', 1], @@ -22,6 +24,7 @@ describe Oga::XML::Lexer do example 'lex a doctype with a public and system ID using single quotes' do lex("").should == [ [:T_DOCTYPE_START, nil, 1], + [:T_DOCTYPE_NAME, 'HTML', 1], [:T_DOCTYPE_TYPE, 'PUBLIC', 1], [:T_STRING, 'foobar', 1], [:T_STRING, 'baz', 1], diff --git a/spec/oga/xml/lexer/documents_spec.rb b/spec/oga/xml/lexer/documents_spec.rb index 9c13751..3aee616 100644 --- a/spec/oga/xml/lexer/documents_spec.rb +++ b/spec/oga/xml/lexer/documents_spec.rb @@ -15,6 +15,7 @@ describe Oga::XML::Lexer do lex(html).should == [ [:T_DOCTYPE_START, nil, 1], + [:T_DOCTYPE_NAME, 'html', 1], [:T_DOCTYPE_END, nil, 1], [:T_TEXT, "\n", 1], diff --git a/spec/oga/xml/parser/doctype_spec.rb b/spec/oga/xml/parser/doctype_spec.rb index 015f30e..9c74851 100644 --- a/spec/oga/xml/parser/doctype_spec.rb +++ b/spec/oga/xml/parser/doctype_spec.rb @@ -3,27 +3,27 @@ require 'spec_helper' describe Oga::XML::Parser do context 'doctypes' do example 'parse a doctype' do - parse('').should == s(:document, s(:doctype)) + parse('').should == s(:document, s(:doctype, 'html')) end example 'parse a doctype with the doctype type' do parse('').should == s( :document, - s(:doctype, 'PUBLIC') + s(:doctype, 'html', 'PUBLIC') ) end example 'parse a doctype with a public ID' do parse('').should == s( :document, - s(:doctype, 'PUBLIC', 'foo') + s(:doctype, 'html', 'PUBLIC', 'foo') ) end example 'parse a doctype with a public and private ID' do parse('').should == s( :document, - s(:doctype, 'PUBLIC', 'foo', 'bar') + s(:doctype, 'html', 'PUBLIC', 'foo', 'bar') ) end @@ -35,6 +35,7 @@ describe Oga::XML::Parser do :document, s( :doctype, + 'HTML', 'PUBLIC', '-//W3C//DTD HTML 4.01//EN', 'http://www.w3.org/TR/html4/strict.dtd' diff --git a/spec/oga/xml/parser/documents_spec.rb b/spec/oga/xml/parser/documents_spec.rb index b5c7192..40fa432 100644 --- a/spec/oga/xml/parser/documents_spec.rb +++ b/spec/oga/xml/parser/documents_spec.rb @@ -15,7 +15,7 @@ describe Oga::XML::Parser do parse(html).should == s( :document, - s(:doctype), + s(:doctype, 'html'), s(:text, "\n"), #