Basic lexing + parsing of doctypes.

We're doing these the lazy way. I can't be bothered writing patterns/rules for
4 different formats for something such as doctypes.
This commit is contained in:
Yorick Peterse 2014-02-27 01:27:43 +01:00
parent d7d20b4c23
commit 2c82f88f6c
5 changed files with 94 additions and 20 deletions

View File

@ -83,35 +83,48 @@ module Oga
lbracket = '['; lbracket = '[';
rbracket = ']'; rbracket = ']';
s_quote = "'"; s_quote = "'";
d_quote = '"'; d_quote = '"';
# FIXME: there really should be a better way of doing this. # FIXME: there really should be a better way of doing this.
text = (any - s_quote - d_quote - equals - bang - slash - text = (any - s_quote - d_quote - equals - bang - slash -
greater - smaller - whitespace - newline - colon - dash - greater - smaller - whitespace - newline - colon - dash -
lbracket - rbracket)+; lbracket - rbracket)+;
# Unicode characters, taken from whitequark's wonderful parser library. # DOCTYPES
# (I honestly need to buy that dude a beer or 100). Basically this #
# takes all characters and removes ASCII ones from the list, thus # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
# leaving you with Unicode. #
unicode = any - ascii; # Doctypes are treated with some extra care on lexer level to make the
# parser's life easier. If they were treated as regular text it would be
# a pain to specify a proper doctype in Racc since it can't match on a
# token's value (only on its type).
#
# Doctype parsing is also relaxed compared to the W3C specification. For
# example, the specification defines 4 doctype formats each having
# different rules. Because Oga doesn't really use the doctype for
# anything we'll just slap all the formats into a single rule. Easy
# enough.
#
# The body is `(any - greater)*` rather than `any*`: a scanner always takes
# the longest match, so `any* greater` would run on to the LAST ">" in the
# input and swallow any markup following the doctype into T_DOCTYPE.
doctype = smaller whitespace* bang whitespace* 'DOCTYPE'i whitespace*
  'HTML'i whitespace* (any - greater)* greater;
main := |* main := |*
whitespace => { t(:T_SPACE) }; whitespace => { t(:T_SPACE) };
newline => { t(:T_NEWLINE); advance_line }; newline => { t(:T_NEWLINE); advance_line };
smaller => { t(:T_SMALLER) };
greater => { t(:T_GREATER) }; doctype => { t(:T_DOCTYPE) };
slash => { t(:T_SLASH) }; smaller => { t(:T_SMALLER) };
d_quote => { t(:T_DQUOTE) }; greater => { t(:T_GREATER) };
s_quote => { t(:T_SQUOTE) }; slash => { t(:T_SLASH) };
dash => { t(:T_DASH) }; d_quote => { t(:T_DQUOTE) };
rbracket => { t(:T_RBRACKET) }; s_quote => { t(:T_SQUOTE) };
lbracket => { t(:T_LBRACKET) }; dash => { t(:T_DASH) };
colon => { t(:T_COLON) }; rbracket => { t(:T_RBRACKET) };
bang => { t(:T_BANG) }; lbracket => { t(:T_LBRACKET) };
equals => { t(:T_EQUALS) }; colon => { t(:T_COLON) };
text => { t(:T_TEXT) }; bang => { t(:T_BANG) };
equals => { t(:T_EQUALS) };
text => { t(:T_TEXT) };
*|; *|;
}%% }%%
end # Lexer end # Lexer

View File

@ -2,7 +2,7 @@ class Oga::Parser::HTML
token T_SPACE T_NEWLINE T_SMALLER T_GREATER T_SLASH token T_SPACE T_NEWLINE T_SMALLER T_GREATER T_SLASH
token T_DQUOTE T_SQUOTE T_DASH T_RBRACKET T_LBRACKET token T_DQUOTE T_SQUOTE T_DASH T_RBRACKET T_LBRACKET
token T_COLON T_BANG T_EQUALS T_TEXT token T_COLON T_BANG T_EQUALS T_TEXT T_DOCTYPE
options no_result_var options no_result_var
@ -19,8 +19,17 @@ rule
expression expression
: tag : tag
| doctype
; ;
# Doctypes
#
# The lexer emits an entire doctype as a single T_DOCTYPE token, so the token
# value (val[0]) is passed as-is as the only child of the :doctype node.
doctype
: T_DOCTYPE { s(:doctype, val[0]) }
;
# Generic HTML tags
tag_start tag_start
# <p> # <p>
: T_SMALLER T_TEXT T_GREATER { val[1] } : T_SMALLER T_TEXT T_GREATER { val[1] }
@ -42,6 +51,16 @@ rule
tag_body tag_body
: T_TEXT : T_TEXT
; ;
# One or more consecutive whitespace tokens (left-recursive list).
# NOTE(review): no rule visible in this hunk references `whitespaces` yet --
# confirm it is used elsewhere, otherwise Racc will report it as useless.
whitespaces
: whitespaces whitespace
| whitespace
;
# A single whitespace token: either a newline or a space.
whitespace
: T_NEWLINE
| T_SPACE
;
end end
---- inner ---- inner

View File

@ -132,4 +132,18 @@ describe Oga::Lexer do
] ]
end end
end end
context 'doctypes' do
  # A doctype must come out of the lexer as exactly one T_DOCTYPE token whose
  # value is the complete source text, followed by `1, 1` (presumably the
  # line and column of the token -- confirm against the lexer).
  example 'lex the HTML5 doctype' do
    doctype = '<!DOCTYPE html>'

    lex(doctype).should == [[:T_DOCTYPE, doctype, 1, 1]]
  end

  example 'lex a random doctype' do
    doctype = '<!DOCTYPE HTML PUBLIC "foobar" "baz">'

    lex(doctype).should == [[:T_DOCTYPE, doctype, 1, 1]]
  end
end
end end

View File

@ -0,0 +1,18 @@
require 'spec_helper'
describe Oga::Parser::HTML do
  context 'doctypes' do
    # Input consisting of nothing but a doctype is expected to parse to a
    # :document node holding one :doctype node, whose only child is the
    # unmodified source string.
    example 'parse the HTML5 doctype' do
      html = '<!DOCTYPE html>'

      parse_html(html).should == s(:document, s(:doctype, html))
    end

    example 'parse an HTML 4 strict doctype' do
      public_id = '"-//W3C//DTD HTML 4.01//EN"'
      system_id = '"http://www.w3.org/TR/html4/strict.dtd"'
      html      = "<!DOCTYPE HTML PUBLIC #{public_id} #{system_id}>"

      parse_html(html).should == s(:document, s(:doctype, html))
    end
  end
end

View File

@ -20,5 +20,15 @@ module Oga
def lex(input) def lex(input)
return Oga::Lexer.new.lex(input) return Oga::Lexer.new.lex(input)
end end
##
# Feeds a string of HTML to a fresh `Oga::Parser::HTML` instance and hands
# back whatever its `parse` method returns.
#
# @param [String] input The HTML source to parse.
# @return [Oga::AST::Node]
#
def parse_html(input)
  parser = Oga::Parser::HTML.new

  return parser.parse(input)
end
end # ParsingHelpers end # ParsingHelpers
end # Oga end # Oga