Lexing of doctypes.
This comes with various structural changes to the lexer as I'm slowly starting to get the hang of Ragel. Ragel is a beast, but damn, it's an awesome piece of software. Note that the doctype public/system IDs are lexed as T_STRING. The parser will figure out whether an ID is a public or system ID based on the order. This fixes #1.
This commit is contained in:
parent
3c825afee0
commit
ca6f422036
120
lib/oga/lexer.rl
120
lib/oga/lexer.rl
|
@ -27,6 +27,8 @@ module Oga
|
||||||
@ts = nil
|
@ts = nil
|
||||||
@te = nil
|
@te = nil
|
||||||
@tokens = []
|
@tokens = []
|
||||||
|
@stack = []
|
||||||
|
@top = 0
|
||||||
end
|
end
|
||||||
|
|
||||||
def lex(data)
|
def lex(data)
|
||||||
|
@ -73,6 +75,13 @@ module Oga
|
||||||
@tokens << token
|
@tokens << token
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def emit_string_buffer
|
||||||
|
add_token(:T_STRING, @string_buffer)
|
||||||
|
advance_column
|
||||||
|
|
||||||
|
@string_buffer = nil
|
||||||
|
end
|
||||||
|
|
||||||
%%{
|
%%{
|
||||||
# Use instance variables for `ts` and friends.
|
# Use instance variables for `ts` and friends.
|
||||||
access @;
|
access @;
|
||||||
|
@ -80,22 +89,85 @@ module Oga
|
||||||
newline = '\n' | '\r\n';
|
newline = '\n' | '\r\n';
|
||||||
whitespace = [ \t];
|
whitespace = [ \t];
|
||||||
|
|
||||||
|
action emit_space {
|
||||||
|
t(:T_SPACE)
|
||||||
|
}
|
||||||
|
|
||||||
|
action emit_newline {
|
||||||
|
t(:T_NEWLINE)
|
||||||
|
advance_line
|
||||||
|
}
|
||||||
|
|
||||||
|
# String processing
|
||||||
|
#
|
||||||
|
# These actions/definitions can be used to process single and/or double
|
||||||
|
# quoted strings (e.g. for tag attribute values).
|
||||||
|
#
|
||||||
|
# The string_dquote and string_squote machines should not be used
|
||||||
|
# directly, instead the corresponding actions should be used.
|
||||||
|
#
|
||||||
|
dquote = '"';
|
||||||
|
squote = "'";
|
||||||
|
|
||||||
|
action buffer_string {
|
||||||
|
@string_buffer ||= ''
|
||||||
|
@string_buffer << text
|
||||||
|
}
|
||||||
|
|
||||||
|
action string_dquote {
|
||||||
|
advance_column
|
||||||
|
fcall string_dquote;
|
||||||
|
}
|
||||||
|
|
||||||
|
action string_squote {
|
||||||
|
advance_column
|
||||||
|
fcall string_squote;
|
||||||
|
}
|
||||||
|
|
||||||
|
string_dquote := |*
|
||||||
|
^dquote => buffer_string;
|
||||||
|
dquote => {
|
||||||
|
emit_string_buffer
|
||||||
|
fret;
|
||||||
|
};
|
||||||
|
*|;
|
||||||
|
|
||||||
|
string_squote := |*
|
||||||
|
^squote => buffer_string;
|
||||||
|
squote => {
|
||||||
|
emit_string_buffer
|
||||||
|
fret;
|
||||||
|
};
|
||||||
|
*|;
|
||||||
|
|
||||||
# DOCTYPES
|
# DOCTYPES
|
||||||
#
|
#
|
||||||
# http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
|
# http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
|
||||||
#
|
#
|
||||||
# Doctypes are treated with some extra care on lexer level to make the
|
# These rules support the 3 flavours of doctypes:
|
||||||
# parser's life easier. If they were treated as regular text it would be
|
|
||||||
# a pain to specify a proper doctype in Racc since it can't match on a
|
|
||||||
# token's value (only on its type).
|
|
||||||
#
|
#
|
||||||
# Doctype parsing is also relaxed compared to the W3 specification. For
|
# 1. Normal doctypes, as introduced in the HTML5 specification.
|
||||||
# example, the specification defines 4 doctype formats each having
|
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
|
||||||
# different rules. Because Oga doesn't really use the doctype for
|
# 3. Legacy doctypes
|
||||||
# anything we'll just slap all the formats into a single rule. Easy
|
#
|
||||||
# enough.
|
doctype_start = '<!DOCTYPE'i whitespace+ 'HTML'i;
|
||||||
doctype = '<' whitespace* '!' whitespace* 'DOCTYPE'i whitespace*
|
|
||||||
'HTML'i whitespace* any* '>';
|
doctype := |*
|
||||||
|
'PUBLIC' | 'SYSTEM' => { t(:T_DOCTYPE_TYPE) };
|
||||||
|
|
||||||
|
# Lex the public/system IDs as regular strings.
|
||||||
|
dquote => string_dquote;
|
||||||
|
squote => string_squote;
|
||||||
|
|
||||||
|
# Whitespace inside doctypes is ignored since there's no point in
|
||||||
|
# including it.
|
||||||
|
whitespace => { advance_column };
|
||||||
|
|
||||||
|
'>' => {
|
||||||
|
t(:T_DOCTYPE_END)
|
||||||
|
fgoto main;
|
||||||
|
};
|
||||||
|
*|;
|
||||||
|
|
||||||
# CDATA
|
# CDATA
|
||||||
#
|
#
|
||||||
|
@ -111,12 +183,6 @@ module Oga
|
||||||
cdata_end = ']]>';
|
cdata_end = ']]>';
|
||||||
|
|
||||||
cdata := |*
|
cdata := |*
|
||||||
cdata_start => {
|
|
||||||
t(:T_CDATA_START)
|
|
||||||
|
|
||||||
@cdata_buffer = ''
|
|
||||||
};
|
|
||||||
|
|
||||||
cdata_end => {
|
cdata_end => {
|
||||||
add_token(:T_TEXT, @cdata_buffer)
|
add_token(:T_TEXT, @cdata_buffer)
|
||||||
@cdata_buffer = nil
|
@cdata_buffer = nil
|
||||||
|
@ -132,13 +198,23 @@ module Oga
|
||||||
*|;
|
*|;
|
||||||
|
|
||||||
main := |*
|
main := |*
|
||||||
whitespace => { t(:T_SPACE) };
|
whitespace => emit_space;
|
||||||
newline => { t(:T_NEWLINE); advance_line };
|
newline => emit_newline;
|
||||||
|
|
||||||
doctype => { t(:T_DOCTYPE) };
|
doctype_start => {
|
||||||
|
t(:T_DOCTYPE_START)
|
||||||
|
|
||||||
# Jump to the cdata machine right away without processing anything.
|
fgoto doctype;
|
||||||
cdata_start >{ fhold; fgoto cdata; };
|
};
|
||||||
|
|
||||||
|
# @cdata_buffer is used to store the content of the CDATA tag.
|
||||||
|
cdata_start => {
|
||||||
|
t(:T_CDATA_START)
|
||||||
|
|
||||||
|
@cdata_buffer = ''
|
||||||
|
|
||||||
|
fgoto cdata;
|
||||||
|
};
|
||||||
|
|
||||||
# General rules and actions.
|
# General rules and actions.
|
||||||
'<' => { t(:T_SMALLER) };
|
'<' => { t(:T_SMALLER) };
|
||||||
|
|
|
@ -4,13 +4,28 @@ describe Oga::Lexer do
|
||||||
context 'doctypes' do
|
context 'doctypes' do
|
||||||
example 'lex the HTML5 doctype' do
|
example 'lex the HTML5 doctype' do
|
||||||
lex('<!DOCTYPE html>').should == [
|
lex('<!DOCTYPE html>').should == [
|
||||||
[:T_DOCTYPE, '<!DOCTYPE html>', 1, 1]
|
[:T_DOCTYPE_START, '<!DOCTYPE html', 1, 1],
|
||||||
|
[:T_DOCTYPE_END, '>', 1, 15]
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
example 'lex a random doctype' do
|
example 'lex a doctype with a public and system ID' do
|
||||||
lex('<!DOCTYPE HTML PUBLIC "foobar" "baz">').should == [
|
lex('<!DOCTYPE HTML PUBLIC "foobar" "baz">').should == [
|
||||||
[:T_DOCTYPE, '<!DOCTYPE HTML PUBLIC "foobar" "baz">', 1, 1]
|
[:T_DOCTYPE_START, '<!DOCTYPE HTML', 1, 1],
|
||||||
|
[:T_DOCTYPE_TYPE, 'PUBLIC', 1, 16],
|
||||||
|
[:T_STRING, 'foobar', 1, 24],
|
||||||
|
[:T_STRING, 'baz', 1, 33],
|
||||||
|
[:T_DOCTYPE_END, '>', 1, 37]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
example 'lex a doctype with a public and system ID using single quotes' do
|
||||||
|
lex("<!DOCTYPE HTML PUBLIC 'foobar' 'baz'>").should == [
|
||||||
|
[:T_DOCTYPE_START, '<!DOCTYPE HTML', 1, 1],
|
||||||
|
[:T_DOCTYPE_TYPE, 'PUBLIC', 1, 16],
|
||||||
|
[:T_STRING, 'foobar', 1, 24],
|
||||||
|
[:T_STRING, 'baz', 1, 33],
|
||||||
|
[:T_DOCTYPE_END, '>', 1, 37]
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue