Basic lexing + parsing of doctypes.

We're doing these the lazy way. I can't be bothered writing patterns/rules for
4 different formats for something such as doctypes.
This commit is contained in:
Yorick Peterse 2014-02-27 01:27:43 +01:00
parent d7d20b4c23
commit 2c82f88f6c
5 changed files with 94 additions and 20 deletions

View File

@ -83,35 +83,48 @@ module Oga
lbracket = '[';
rbracket = ']';
s_quote = "'";
d_quote = '"';
s_quote = "'";
d_quote = '"';
# FIXME: there really should be a better way of doing this.
text = (any - s_quote - d_quote - equals - bang - slash -
greater - smaller - whitespace - newline - colon - dash -
lbracket - rbracket)+;
# Unicode characters, taken from whitequark's wonderful parser library.
# (I honestly need to buy that dude a beer or 100). Basically this
# takes all characters and removes ASCII ones from the list, thus
# leaving you with Unicode.
unicode = any - ascii;
# DOCTYPES
#
# http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
#
# Doctypes are matched as a single token at the lexer level to make the
# parser's life easier. If they were treated as regular text it would be
# a pain to specify a proper doctype in Racc since it can't match on a
# token's value (only on its type).
#
# Doctype parsing is also relaxed compared to the W3 specification. For
# example, the specification defines 4 doctype formats each having
# different rules. Because Oga doesn't really use the doctype for
# anything all formats are folded into a single rule.
#
# The part after "HTML" is matched with `(any - greater)*` instead of
# `any*`. Scanners pick the longest match, so `any* greater` would run up
# to the LAST ">" of the input and swallow every tag following the
# doctype into the T_DOCTYPE token. Excluding ">" ends the token at the
# first ">" instead. (Consequence: a literal ">" inside a quoted
# identifier terminates the doctype early.)
doctype = smaller whitespace* bang whitespace* 'DOCTYPE'i whitespace*
'HTML'i whitespace* (any - greater)* greater;
# Main scanner. Every alternative emits one token through `t`, passing the
# token type; the newline alternative additionally calls `advance_line`.
main := |*
whitespace => { t(:T_SPACE) };
newline => { t(:T_NEWLINE); advance_line };
smaller => { t(:T_SMALLER) };
greater => { t(:T_GREATER) };
slash => { t(:T_SLASH) };
d_quote => { t(:T_DQUOTE) };
s_quote => { t(:T_SQUOTE) };
dash => { t(:T_DASH) };
rbracket => { t(:T_RBRACKET) };
lbracket => { t(:T_LBRACKET) };
colon => { t(:T_COLON) };
bang => { t(:T_BANG) };
equals => { t(:T_EQUALS) };
text => { t(:T_TEXT) };
doctype => { t(:T_DOCTYPE) };
smaller => { t(:T_SMALLER) };
greater => { t(:T_GREATER) };
slash => { t(:T_SLASH) };
d_quote => { t(:T_DQUOTE) };
s_quote => { t(:T_SQUOTE) };
dash => { t(:T_DASH) };
rbracket => { t(:T_RBRACKET) };
lbracket => { t(:T_LBRACKET) };
colon => { t(:T_COLON) };
bang => { t(:T_BANG) };
equals => { t(:T_EQUALS) };
text => { t(:T_TEXT) };
*|;
}%%
end # Lexer

View File

@ -2,7 +2,7 @@ class Oga::Parser::HTML
token T_SPACE T_NEWLINE T_SMALLER T_GREATER T_SLASH
token T_DQUOTE T_SQUOTE T_DASH T_RBRACKET T_LBRACKET
token T_COLON T_BANG T_EQUALS T_TEXT
token T_COLON T_BANG T_EQUALS T_TEXT T_DOCTYPE
options no_result_var
@ -19,8 +19,17 @@ rule
# A single expression: either a generic tag or a doctype.
expression
: tag
| doctype
;
# Doctypes
#
# The lexer hands over a complete doctype as one T_DOCTYPE token. The
# token's value (val[0]) is passed to `s` along with the :doctype type.
doctype
: T_DOCTYPE { s(:doctype, val[0]) }
;
# Generic HTML tags
tag_start
# <p>
: T_SMALLER T_TEXT T_GREATER { val[1] }
@ -42,6 +51,16 @@ rule
# Body of a tag: a single T_TEXT token.
tag_body
: T_TEXT
;
# One or more consecutive `whitespace` items (left-recursive list).
whitespaces
: whitespaces whitespace
| whitespace
;
# A single whitespace token: either a newline or a space.
whitespace
: T_NEWLINE
| T_SPACE
;
end
---- inner

View File

@ -132,4 +132,18 @@ describe Oga::Lexer do
]
end
end
context 'doctypes' do
  # The whole doctype must come back as a single T_DOCTYPE token.
  example 'lex the HTML5 doctype' do
    tokens = lex('<!DOCTYPE html>')

    tokens.should == [[:T_DOCTYPE, '<!DOCTYPE html>', 1, 1]]
  end

  # Anything between "HTML" and the closing ">" belongs to the same token.
  example 'lex a random doctype' do
    tokens = lex('<!DOCTYPE HTML PUBLIC "foobar" "baz">')

    tokens.should == [
      [:T_DOCTYPE, '<!DOCTYPE HTML PUBLIC "foobar" "baz">', 1, 1]
    ]
  end
end
end

View File

@ -0,0 +1,18 @@
require 'spec_helper'
describe Oga::Parser::HTML do
  context 'doctypes' do
    # The parser should wrap the raw doctype text in a :doctype node.
    example 'parse the HTML5 doctype' do
      html5 = '<!DOCTYPE html>'
      ast   = parse_html(html5)

      ast.should == s(:document, s(:doctype, html5))
    end

    example 'parse an HTML 4 strict doctype' do
      html4_strict = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" ' \
        '"http://www.w3.org/TR/html4/strict.dtd">'
      ast = parse_html(html4_strict)

      ast.should == s(:document, s(:doctype, html4_strict))
    end
  end
end

View File

@ -20,5 +20,15 @@ module Oga
def lex(input)
  # A fresh lexer per call keeps examples independent of each other.
  lexer = Oga::Lexer.new

  lexer.lex(input)
end
##
# Feeds the given HTML to a new `Oga::Parser::HTML` instance and returns
# whatever its `parse` method produces (the AST).
#
# @param [String] input
# @return [Oga::AST::Node]
#
def parse_html(input)
  parser = Oga::Parser::HTML.new

  parser.parse(input)
end
end # ParsingHelpers
end # Oga