Basic lexing + parsing of doctypes.
We're doing these the lazy way. I can't be bothered writing patterns/rules for 4 different formats for something such as doctypes.
This commit is contained in:
parent
d7d20b4c23
commit
2c82f88f6c
|
@ -83,35 +83,48 @@ module Oga
|
|||
lbracket = '[';
|
||||
rbracket = ']';
|
||||
|
||||
s_quote = "'";
|
||||
d_quote = '"';
|
||||
s_quote = "'";
|
||||
d_quote = '"';
|
||||
|
||||
# FIXME: find a better way to define `text` than subtracting every special character from `any`.
|
||||
text = (any - s_quote - d_quote - equals - bang - slash -
|
||||
greater - smaller - whitespace - newline - colon - dash -
|
||||
lbracket - rbracket)+;
|
||||
|
||||
# Unicode characters, taken from whitequark's wonderful parser library.
|
||||
# (I honestly need to buy that dude a beer or 100). Basically this
|
||||
# takes all characters and removes ASCII ones from the list, thus
|
||||
# leaving you with Unicode.
|
||||
unicode = any - ascii;
|
||||
# DOCTYPES
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
|
||||
#
|
||||
# Doctypes are treated with some extra care at the lexer level to make the
|
||||
# parser's life easier. If they were treated as regular text it would be
|
||||
# a pain to specify a proper doctype in Racc since it can't match on a
|
||||
# token's value (only on its type).
|
||||
#
|
||||
# Doctype parsing is also relaxed compared to the W3C specification. For
|
||||
# example, the specification defines 4 doctype formats each having
|
||||
# different rules. Because Oga doesn't really use the doctype for
|
||||
# anything we'll just slap all the formats into a single rule. Easy
|
||||
# enough.
|
||||
# The tail excludes ">" on purpose: scanner patterns are longest-match, so a
# bare `any*` would make the token run up to the LAST ">" in the input and
# swallow whatever markup follows the doctype.
doctype = smaller whitespace* bang whitespace* 'DOCTYPE'i whitespace*
  'HTML'i whitespace* (any - greater)* greater;
|
||||
|
||||
main := |*
|
||||
whitespace => { t(:T_SPACE) };
|
||||
newline => { t(:T_NEWLINE); advance_line };
|
||||
smaller => { t(:T_SMALLER) };
|
||||
greater => { t(:T_GREATER) };
|
||||
slash => { t(:T_SLASH) };
|
||||
d_quote => { t(:T_DQUOTE) };
|
||||
s_quote => { t(:T_SQUOTE) };
|
||||
dash => { t(:T_DASH) };
|
||||
rbracket => { t(:T_RBRACKET) };
|
||||
lbracket => { t(:T_LBRACKET) };
|
||||
colon => { t(:T_COLON) };
|
||||
bang => { t(:T_BANG) };
|
||||
equals => { t(:T_EQUALS) };
|
||||
text => { t(:T_TEXT) };
|
||||
|
||||
doctype => { t(:T_DOCTYPE) };
|
||||
smaller => { t(:T_SMALLER) };
|
||||
greater => { t(:T_GREATER) };
|
||||
slash => { t(:T_SLASH) };
|
||||
d_quote => { t(:T_DQUOTE) };
|
||||
s_quote => { t(:T_SQUOTE) };
|
||||
dash => { t(:T_DASH) };
|
||||
rbracket => { t(:T_RBRACKET) };
|
||||
lbracket => { t(:T_LBRACKET) };
|
||||
colon => { t(:T_COLON) };
|
||||
bang => { t(:T_BANG) };
|
||||
equals => { t(:T_EQUALS) };
|
||||
text => { t(:T_TEXT) };
|
||||
*|;
|
||||
}%%
|
||||
end # Lexer
|
||||
|
|
|
@ -2,7 +2,7 @@ class Oga::Parser::HTML
|
|||
|
||||
token T_SPACE T_NEWLINE T_SMALLER T_GREATER T_SLASH
|
||||
token T_DQUOTE T_SQUOTE T_DASH T_RBRACKET T_LBRACKET
|
||||
token T_COLON T_BANG T_EQUALS T_TEXT
|
||||
token T_COLON T_BANG T_EQUALS T_TEXT T_DOCTYPE
|
||||
|
||||
options no_result_var
|
||||
|
||||
|
@ -19,8 +19,17 @@ rule
|
|||
|
||||
expression
|
||||
: tag
|
||||
| doctype
|
||||
;
|
||||
|
||||
# Doctypes
|
||||
|
||||
doctype
|
||||
: T_DOCTYPE { s(:doctype, val[0]) }
|
||||
;
|
||||
|
||||
# Generic HTML tags
|
||||
|
||||
tag_start
|
||||
# <p>
|
||||
: T_SMALLER T_TEXT T_GREATER { val[1] }
|
||||
|
@ -42,6 +51,16 @@ rule
|
|||
tag_body
|
||||
: T_TEXT
|
||||
;
|
||||
|
||||
whitespaces
|
||||
: whitespaces whitespace
|
||||
| whitespace
|
||||
;
|
||||
|
||||
whitespace
|
||||
: T_NEWLINE
|
||||
| T_SPACE
|
||||
;
|
||||
end
|
||||
|
||||
---- inner
|
||||
|
|
|
@ -132,4 +132,18 @@ describe Oga::Lexer do
|
|||
]
|
||||
end
|
||||
end
|
||||
|
||||
context 'doctypes' do
|
||||
example 'lex the HTML5 doctype' do
|
||||
lex('<!DOCTYPE html>').should == [
|
||||
[:T_DOCTYPE, '<!DOCTYPE html>', 1, 1]
|
||||
]
|
||||
end
|
||||
|
||||
example 'lex a random doctype' do
|
||||
lex('<!DOCTYPE HTML PUBLIC "foobar" "baz">').should == [
|
||||
[:T_DOCTYPE, '<!DOCTYPE HTML PUBLIC "foobar" "baz">', 1, 1]
|
||||
]
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Parser::HTML do
|
||||
context 'doctypes' do
|
||||
example 'parse the HTML5 doctype' do
|
||||
doctype = '<!DOCTYPE html>'
|
||||
|
||||
parse_html(doctype).should == s( :document, s(:doctype, doctype))
|
||||
end
|
||||
|
||||
example 'parse an HTML 4 strict doctype' do
|
||||
doctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" ' \
|
||||
'"http://www.w3.org/TR/html4/strict.dtd">'
|
||||
|
||||
parse_html(doctype).should == s(:document, s(:doctype, doctype))
|
||||
end
|
||||
end
|
||||
end
|
|
@ -20,5 +20,15 @@ module Oga
|
|||
def lex(input)
  # A new lexer instance is built for every call.
  lexer = Oga::Lexer.new

  return lexer.lex(input)
end
|
||||
|
||||
##
|
||||
# Parses the given HTML and returns an AST.
|
||||
#
|
||||
# @param [String] input
|
||||
# @return [Oga::AST::Node]
|
||||
#
|
||||
def parse_html(input)
  # A new parser instance is built for every call.
  parser = Oga::Parser::HTML.new

  return parser.parse(input)
end
|
||||
end # ParsingHelpers
|
||||
end # Oga
|
||||
|
|
Loading…
Reference in New Issue