Lexing of doctypes.

This comes with various structural changes to the lexer as I'm slowly starting
to get the hang of Ragel. Ragel is a beast but damn it's an awesome piece of
software.

Note that the doctype public/system IDs are lexed as T_STRING. The parser will
figure out whether an ID is a public or system ID based on the order.

This fixes #1
This commit is contained in:
Yorick Peterse 2014-02-28 23:08:55 +01:00
parent 3c825afee0
commit ca6f422036
2 changed files with 116 additions and 25 deletions

View File

@ -27,6 +27,8 @@ module Oga
@ts = nil
@te = nil
@tokens = []
@stack = []
@top = 0
end
def lex(data)
@ -73,6 +75,13 @@ module Oga
@tokens << token
end
# Emits the buffered contents of a quoted string as a single T_STRING token
# and resets the buffer afterwards.
#
# The buffer is only created once the first character of a string has been
# buffered. An empty quoted string ("" or '') therefore reaches this method
# with a nil buffer; in that case an empty String is emitted instead of nil
# so that every T_STRING token carries a String value.
def emit_string_buffer
  add_token(:T_STRING, @string_buffer || '')

  # NOTE(review): presumably accounts for the closing quote, which is not
  # part of the token value; confirm against advance_column.
  advance_column

  @string_buffer = nil
end
%%{
# Use instance variables for `ts` and friends.
access @;
@ -80,22 +89,85 @@ module Oga
# A newline is either a bare LF or a CRLF pair.
newline = '\n' | '\r\n';
# Horizontal whitespace: a single space or tab character.
whitespace = [ \t];
# Calls the `t` helper with :T_SPACE. The main machine runs this action for
# every whitespace match. `t` is defined outside this view; presumably it
# emits a token of the given type for the current match (TODO confirm).
action emit_space {
t(:T_SPACE)
}
# Calls the `t` helper with :T_NEWLINE and then advance_line. The main
# machine runs this action for every newline match. Both helpers are defined
# outside this view; presumably advance_line moves the position bookkeeping
# to the start of the next line (TODO confirm).
action emit_newline {
t(:T_NEWLINE)
advance_line
}
# String processing
#
# These actions/definitions can be used to process single and/or double
# quoted strings (e.g. for tag attribute values).
#
# The string_dquote and string_squote machines should not be used
# directly, instead the corresponding actions should be used.
#
# The quote characters that open and close double and single quoted strings.
dquote = '"';
squote = "'";
# Appends the value of `text` to @string_buffer, creating the buffer as an
# empty String on first use. `text` is defined outside this view; presumably
# it returns the text of the current match (TODO confirm).
action buffer_string {
@string_buffer ||= ''
@string_buffer << text
}
# Starts lexing a double quoted string. advance_column is called first
# (presumably to account for the opening quote, which is not buffered) and
# fcall then calls into the string_dquote machine. That machine hands control
# back to the calling machine with fret once the closing quote is found.
action string_dquote {
advance_column
fcall string_dquote;
}
# Starts lexing a single quoted string. advance_column is called first
# (presumably to account for the opening quote, which is not buffered) and
# fcall then calls into the string_squote machine. That machine hands control
# back to the calling machine with fret once the closing quote is found.
action string_squote {
advance_column
fcall string_squote;
}
# Scanner for the body of a double quoted string, entered through the
# string_dquote action. Every character other than a double quote is handed
# to buffer_string. The first double quote ends the string (there is no
# escape handling): the buffered content is emitted and fret returns to the
# machine that made the call.
string_dquote := |*
^dquote => buffer_string;
dquote => {
emit_string_buffer
fret;
};
*|;
# Scanner for the body of a single quoted string, entered through the
# string_squote action. Every character other than a single quote is handed
# to buffer_string. The first single quote ends the string (there is no
# escape handling): the buffered content is emitted and fret returns to the
# machine that made the call.
string_squote := |*
^squote => buffer_string;
squote => {
emit_string_buffer
fret;
};
*|;
# DOCTYPES
#
# http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
#
# Doctypes are treated with some extra care on lexer level to make the
# parser's life easier. If they were treated as regular text it would be
# a pain to specify a proper doctype in Racc since it can't match on a
# token's value (only on its type).
# These rules support the 3 flavours of doctypes:
#
# Doctype parsing is also relaxed compared to the W3 specification. For
# example, the specification defines 4 doctype formats each having
# different rules. Because Oga doesn't really use the doctype for
# anything we'll just slap all the formats into a single rule. Easy
# enough.
doctype = '<' whitespace* '!' whitespace* 'DOCTYPE'i whitespace*
'HTML'i whitespace* any* '>';
# 1. Normal doctypes, as introduced in the HTML5 specification.
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
# 3. Legacy doctypes
#
doctype_start = '<!DOCTYPE'i whitespace+ 'HTML'i;
# Scanner for everything between the doctype start and the closing ">".
#
# The main machine jumps here after matching doctype_start. Once ">" is
# found control goes back to the main machine.
doctype := |*
# The keywords are matched case insensitively, in line with doctype_start
# which already matches "DOCTYPE" and "HTML" that way. The emitted token
# value keeps whatever casing the input used.
'PUBLIC'i | 'SYSTEM'i => { t(:T_DOCTYPE_TYPE) };
# Lex the public/system IDs as regular strings.
dquote => string_dquote;
squote => string_squote;
# Whitespace inside doctypes is ignored since there's no point in
# including it.
whitespace => { advance_column };
'>' => {
t(:T_DOCTYPE_END)
fgoto main;
};
*|;
# CDATA
#
@ -111,12 +183,6 @@ module Oga
cdata_end = ']]>';
cdata := |*
cdata_start => {
t(:T_CDATA_START)
@cdata_buffer = ''
};
cdata_end => {
add_token(:T_TEXT, @cdata_buffer)
@cdata_buffer = nil
@ -132,13 +198,23 @@ module Oga
*|;
main := |*
whitespace => { t(:T_SPACE) };
newline => { t(:T_NEWLINE); advance_line };
whitespace => emit_space;
newline => emit_newline;
doctype => { t(:T_DOCTYPE) };
doctype_start => {
t(:T_DOCTYPE_START)
# Jump to the cdata machine right away without processing anything.
cdata_start >{ fhold; fgoto cdata; };
fgoto doctype;
};
# @cdata_buffer is used to store the content of the CDATA tag.
cdata_start => {
t(:T_CDATA_START)
@cdata_buffer = ''
fgoto cdata;
};
# General rules and actions.
'<' => { t(:T_SMALLER) };

View File

@ -4,13 +4,28 @@ describe Oga::Lexer do
context 'doctypes' do
example 'lex the HTML5 doctype' do
lex('<!DOCTYPE html>').should == [
[:T_DOCTYPE, '<!DOCTYPE html>', 1, 1]
[:T_DOCTYPE_START, '<!DOCTYPE html', 1, 1],
[:T_DOCTYPE_END, '>', 1, 15]
]
end
example 'lex a random doctype' do
example 'lex a doctype with a public and system ID' do
lex('<!DOCTYPE HTML PUBLIC "foobar" "baz">').should == [
[:T_DOCTYPE, '<!DOCTYPE HTML PUBLIC "foobar" "baz">', 1, 1]
[:T_DOCTYPE_START, '<!DOCTYPE HTML', 1, 1],
[:T_DOCTYPE_TYPE, 'PUBLIC', 1, 16],
[:T_STRING, 'foobar', 1, 24],
[:T_STRING, 'baz', 1, 33],
[:T_DOCTYPE_END, '>', 1, 37]
]
end
example 'lex a doctype with a public and system ID using single quotes' do
lex("<!DOCTYPE HTML PUBLIC 'foobar' 'baz'>").should == [
[:T_DOCTYPE_START, '<!DOCTYPE HTML', 1, 1],
[:T_DOCTYPE_TYPE, 'PUBLIC', 1, 16],
[:T_STRING, 'foobar', 1, 24],
[:T_STRING, 'baz', 1, 33],
[:T_DOCTYPE_END, '>', 1, 37]
]
end
end