diff --git a/lib/oga/lexer.rl b/lib/oga/lexer.rl index 0518f9e..c41dc35 100644 --- a/lib/oga/lexer.rl +++ b/lib/oga/lexer.rl @@ -83,35 +83,48 @@ module Oga lbracket = '['; rbracket = ']'; - s_quote = "'"; - d_quote = '"'; + s_quote = "'"; + d_quote = '"'; # FIXME: there really should be a better way of doing this. text = (any - s_quote - d_quote - equals - bang - slash - greater - smaller - whitespace - newline - colon - dash - lbracket - rbracket)+; - # Unicode characters, taken from whitequark's wonderful parser library. - # (I honestly need to buy that dude a beer or 100). Basically this - # takes all characters and removes ASCII ones from the list, thus - # leaving you with Unicode. - unicode = any - ascii; + # DOCTYPES + # + # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax + # + # Doctypes are treated with some extra care on lexer level to make the + # parser's life easier. If they were treated as regular text it would be + # a pain to specify a proper doctype in Racc since it can't match on a + # token's value (only on its type). + # + # Doctype parsing is also relaxed compared to the W3 specification. For + # example, the specification defines 4 doctype formats each having + # different rules. Because Oga doesn't really use the doctype for + # anything we'll just slap all the formats into a single rule. Easy + # enough. + doctype = smaller whitespace* bang whitespace* 'DOCTYPE'i whitespace* + 'HTML'i whitespace* any* greater; main := |* whitespace => { t(:T_SPACE) }; newline => { t(:T_NEWLINE); advance_line }; - smaller => { t(:T_SMALLER) }; - greater => { t(:T_GREATER) }; - slash => { t(:T_SLASH) }; - d_quote => { t(:T_DQUOTE) }; - s_quote => { t(:T_SQUOTE) }; - dash => { t(:T_DASH) }; - rbracket => { t(:T_RBRACKET) }; - lbracket => { t(:T_LBRACKET) }; - colon => { t(:T_COLON) }; - bang => { t(:T_BANG) }; - equals => { t(:T_EQUALS) }; - text => { t(:T_TEXT) }; + + doctype => { t(:T_DOCTYPE) }; + smaller => { t(:T_SMALLER) }; + greater => { t(:T_GREATER) }; + slash => { t(:T_SLASH) }; + d_quote => { t(:T_DQUOTE) }; + s_quote => { t(:T_SQUOTE) }; + dash => { t(:T_DASH) }; + rbracket => { t(:T_RBRACKET) }; + lbracket => { t(:T_LBRACKET) }; + colon => { t(:T_COLON) }; + bang => { t(:T_BANG) }; + equals => { t(:T_EQUALS) }; + text => { t(:T_TEXT) }; *|; }%% end # Lexer diff --git a/lib/oga/parser/html.y b/lib/oga/parser/html.y index 11ccde7..ada4bbd 100644 --- a/lib/oga/parser/html.y +++ b/lib/oga/parser/html.y @@ -2,7 +2,7 @@ class Oga::Parser::HTML token T_SPACE T_NEWLINE T_SMALLER T_GREATER T_SLASH token T_DQUOTE T_SQUOTE T_DASH T_RBRACKET T_LBRACKET -token T_COLON T_BANG T_EQUALS T_TEXT +token T_COLON T_BANG T_EQUALS T_TEXT T_DOCTYPE options no_result_var @@ -19,8 +19,17 @@ rule expression : tag + | doctype ; + # Doctypes + + doctype + : T_DOCTYPE { s(:doctype, val[0]) } + ; + + # Generic HTML tags + tag_start #
: T_SMALLER T_TEXT T_GREATER { val[1] } @@ -42,6 +51,16 @@ rule tag_body : T_TEXT ; + + whitespaces + : whitespaces whitespace + | whitespace + ; + + whitespace + : T_NEWLINE + | T_SPACE + ; end ---- inner diff --git a/spec/oga/lexer_spec.rb b/spec/oga/lexer_spec.rb index ca99113..9c0ed68 100644 --- a/spec/oga/lexer_spec.rb +++ b/spec/oga/lexer_spec.rb @@ -132,4 +132,18 @@ describe Oga::Lexer do ] end end + + context 'doctypes' do + example 'lex the HTML5 doctype' do + lex('').should == [ + [:T_DOCTYPE, '', 1, 1] + ] + end + + example 'lex a random doctype' do + lex('').should == [ + [:T_DOCTYPE, '', 1, 1] + ] + end + end end diff --git a/spec/oga/parser/html/doctype_spec.rb b/spec/oga/parser/html/doctype_spec.rb new file mode 100644 index 0000000..be85d22 --- /dev/null +++ b/spec/oga/parser/html/doctype_spec.rb @@ -0,0 +1,18 @@ +require 'spec_helper' + +describe Oga::Parser::HTML do + context 'doctypes' do + example 'parse the HTML5 doctype' do + doctype = '' + + parse_html(doctype).should == s( :document, s(:doctype, doctype)) + end + + example 'parse an HTML 4 strict doctype' do + doctype = '' + + parse_html(doctype).should == s(:document, s(:doctype, doctype)) + end + end +end diff --git a/spec/support/parsing.rb b/spec/support/parsing.rb index 5da78df..72c81d5 100644 --- a/spec/support/parsing.rb +++ b/spec/support/parsing.rb @@ -20,5 +20,15 @@ module Oga def lex(input) return Oga::Lexer.new.lex(input) end + + ## + # Parses the given HTML and returns an AST. + # + # @param [String] input + # @return [Oga::AST::Node] + # + def parse_html(input) + return Oga::Parser::HTML.new.parse(input) + end end # ParsingHelpers end # Oga