393 lines
8.8 KiB
Ragel
393 lines
8.8 KiB
Ragel
%%machine lexer; # %
|
|
|
|
module Oga
|
|
##
|
|
#
|
|
class Lexer
|
|
%% write data; # %
|
|
|
|
attr_reader :html
|
|
|
|
HTML_VOID_ELEMENTS = [
|
|
'area',
|
|
'base',
|
|
'br',
|
|
'col',
|
|
'command',
|
|
'embed',
|
|
'hr',
|
|
'img',
|
|
'input',
|
|
'keygen',
|
|
'link',
|
|
'meta',
|
|
'param',
|
|
'source',
|
|
'track',
|
|
'wbr'
|
|
]
|
|
|
|
# Lazy way of forwarding instance method calls used internally by Ragel to
|
|
# their corresponding class methods.
|
|
private_methods.grep(/^_lexer_/).each do |name|
|
|
define_method(name) do
|
|
return self.class.send(name)
|
|
end
|
|
|
|
private(name)
|
|
end
|
|
|
|
def initialize(options = {})
|
|
options.each do |key, value|
|
|
instance_variable_set("@#{key}", value) if respond_to?(key)
|
|
end
|
|
|
|
reset
|
|
end
|
|
|
|
def reset
|
|
@line = 1
|
|
@column = 1
|
|
@data = nil
|
|
@ts = nil
|
|
@te = nil
|
|
@tokens = []
|
|
@stack = []
|
|
@top = 0
|
|
@elements = []
|
|
|
|
@string_buffer = ''
|
|
@text_buffer = ''
|
|
end
|
|
|
|
def lex(data)
|
|
@data = data
|
|
lexer_start = self.class.lexer_start
|
|
eof = data.length
|
|
|
|
%% write init;
|
|
%% write exec;
|
|
|
|
tokens = @tokens
|
|
|
|
reset
|
|
|
|
return tokens
|
|
end
|
|
|
|
def html?
|
|
return !!html
|
|
end
|
|
|
|
private
|
|
|
|
def advance_line(amount = 1)
|
|
@line += amount
|
|
@column = 1
|
|
end
|
|
|
|
def advance_column(length = 1)
|
|
@column += length
|
|
end
|
|
|
|
def t(type, start = @ts, stop = @te)
|
|
value = text(start, stop)
|
|
|
|
add_token(type, value)
|
|
end
|
|
|
|
def text(start = @ts, stop = @te)
|
|
return @data[start...stop]
|
|
end
|
|
|
|
def add_token(type, value)
|
|
token = [type, value, @line, @column]
|
|
|
|
advance_column(value.length) if value
|
|
|
|
@tokens << token
|
|
end
|
|
|
|
def emit_text_buffer
|
|
return if @text_buffer.empty?
|
|
|
|
add_token(:T_TEXT, @text_buffer)
|
|
|
|
lines = @text_buffer.count("\n")
|
|
|
|
advance_line(lines) if lines > 0
|
|
|
|
@text_buffer = ''
|
|
end
|
|
|
|
def buffer_text_until_eof(eof)
|
|
@text_buffer << text
|
|
|
|
emit_text_buffer if @te == eof
|
|
end
|
|
|
|
def emit_string_buffer
|
|
add_token(:T_STRING, @string_buffer)
|
|
advance_column
|
|
|
|
@string_buffer = ''
|
|
end
|
|
|
|
def current_element
|
|
return @elements.last
|
|
end
|
|
|
|
%%{
|
|
# Use instance variables for `ts` and friends.
|
|
access @;
|
|
|
|
newline = '\n' | '\r\n';
|
|
whitespace = [ \t];
|
|
|
|
# Strings
|
|
#
|
|
# Strings in HTML can either be single or double quoted. If a string
|
|
# starts with one of these quotes it must be closed with the same type of
|
|
# quote.
|
|
dquote = '"';
|
|
squote = "'";
|
|
|
|
action buffer_text {
|
|
@text_buffer << text
|
|
}
|
|
|
|
action buffer_string {
|
|
@string_buffer << text
|
|
}
|
|
|
|
action start_string_dquote {
|
|
fcall string_dquote;
|
|
}
|
|
|
|
action start_string_squote {
|
|
fcall string_squote;
|
|
}
|
|
|
|
# Machine for processing double quoted strings.
|
|
string_dquote := |*
|
|
^dquote => buffer_string;
|
|
dquote => {
|
|
emit_string_buffer
|
|
advance_column
|
|
fret;
|
|
};
|
|
*|;
|
|
|
|
# Machine for processing single quoted strings.
|
|
string_squote := |*
|
|
^squote => buffer_string;
|
|
squote => {
|
|
emit_string_buffer
|
|
advance_column
|
|
fret;
|
|
};
|
|
*|;
|
|
|
|
# DOCTYPES
|
|
#
|
|
# http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
|
|
#
|
|
# These rules support the 3 flavours of doctypes:
|
|
#
|
|
# 1. Normal doctypes, as introduced in the HTML5 specification.
|
|
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
|
|
# 3. Legacy doctypes
|
|
#
|
|
doctype_start = '<!DOCTYPE'i whitespace+ 'HTML'i;
|
|
|
|
action start_doctype {
|
|
emit_text_buffer
|
|
t(:T_DOCTYPE_START)
|
|
fcall doctype;
|
|
}
|
|
|
|
# Machine for processing doctypes. Doctype values such as the public and
|
|
# system IDs are treated as T_STRING tokens.
|
|
doctype := |*
|
|
'PUBLIC' | 'SYSTEM' => { t(:T_DOCTYPE_TYPE) };
|
|
|
|
# Lex the public/system IDs as regular strings.
|
|
dquote => start_string_dquote;
|
|
squote => start_string_squote;
|
|
|
|
# Whitespace inside doctypes is ignored since there's no point in
|
|
# including it.
|
|
whitespace => { advance_column };
|
|
|
|
'>' => {
|
|
t(:T_DOCTYPE_END)
|
|
fret;
|
|
};
|
|
*|;
|
|
|
|
# CDATA
|
|
#
|
|
# http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
|
|
#
|
|
# CDATA tags are broken up into 3 parts: the start, the content and the
|
|
# end tag.
|
|
#
|
|
# In HTML CDATA tags have no meaning/are not supported. Oga does
|
|
# support them but treats their contents as plain text.
|
|
#
|
|
cdata_start = '<![CDATA[';
|
|
cdata_end = ']]>';
|
|
|
|
action start_cdata {
|
|
emit_text_buffer
|
|
t(:T_CDATA_START)
|
|
fcall cdata;
|
|
}
|
|
|
|
# Machine that for processing the contents of CDATA tags. Everything
|
|
# inside a CDATA tag is treated as plain text.
|
|
cdata := |*
|
|
cdata_end => {
|
|
emit_text_buffer
|
|
t(:T_CDATA_END)
|
|
fret;
|
|
};
|
|
|
|
any => buffer_text;
|
|
*|;
|
|
|
|
# Comments
|
|
#
|
|
# http://www.w3.org/TR/html-markup/syntax.html#comments
|
|
#
|
|
# Comments are lexed into 3 parts: the start tag, the content and the end
|
|
# tag.
|
|
#
|
|
# Unlike the W3 specification these rules *do* allow character sequences
|
|
# such as `--` and `->`. Putting extra checks in for these sequences
|
|
# would actually make the rules/actions more complex.
|
|
#
|
|
comment_start = '<!--';
|
|
comment_end = '-->';
|
|
|
|
action start_comment {
|
|
emit_text_buffer
|
|
t(:T_COMMENT_START)
|
|
fcall comment;
|
|
}
|
|
|
|
# Machine used for processing the contents of a comment. Everything
|
|
# inside a comment is treated as plain text (similar to CDATA tags).
|
|
comment := |*
|
|
comment_end => {
|
|
emit_text_buffer
|
|
t(:T_COMMENT_END)
|
|
fret;
|
|
};
|
|
|
|
any => buffer_text;
|
|
*|;
|
|
|
|
# Elements
|
|
#
|
|
# http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
|
|
#
|
|
|
|
# Action that creates the tokens for the opening tag, name and namespace
|
|
# (if any). Remaining work is delegated to a dedicated machine.
|
|
action start_element {
|
|
emit_text_buffer
|
|
add_token(:T_ELEM_OPEN, nil)
|
|
advance_column
|
|
|
|
# Add the element name. If the name includes a namespace we'll break
|
|
# the name up into two separate tokens.
|
|
name = text(@ts + 1)
|
|
|
|
if name.include?(':')
|
|
ns, name = name.split(':')
|
|
|
|
add_token(:T_ELEM_NS, ns)
|
|
|
|
# Advance the column for the colon (:) that separates the namespace
|
|
# and element name.
|
|
advance_column
|
|
end
|
|
|
|
@elements << name
|
|
|
|
add_token(:T_ELEM_NAME, name)
|
|
|
|
fcall element_head;
|
|
}
|
|
|
|
element_name = [a-zA-Z0-9\-_:]+;
|
|
element_start = '<' element_name;
|
|
|
|
# Machine used for processing the characters inside a element head. An
|
|
# element head is everything between `<NAME` (where NAME is the element
|
|
# name) and `>`.
|
|
#
|
|
# For example, in `<p foo="bar">` the element head is ` foo="bar"`.
|
|
#
|
|
element_head := |*
|
|
(whitespace | '=') => { advance_column };
|
|
|
|
# Attribute names.
|
|
element_name => { t(:T_ATTR) };
|
|
|
|
# Attribute values.
|
|
dquote => start_string_dquote;
|
|
squote => start_string_squote;
|
|
|
|
# The closing character of the open tag.
|
|
('>' | '/') => {
|
|
fhold;
|
|
fret;
|
|
};
|
|
*|;
|
|
|
|
main := |*
|
|
element_start => start_element;
|
|
doctype_start => start_doctype;
|
|
cdata_start => start_cdata;
|
|
comment_start => start_comment;
|
|
element_start => start_element;
|
|
|
|
# Enter the body of the tag. If HTML mode is enabled and the current
|
|
# element is a void element we'll close it and bail out.
|
|
'>' => {
|
|
if html? and HTML_VOID_ELEMENTS.include?(current_element)
|
|
add_token(:T_ELEM_CLOSE, nil)
|
|
@elements.pop
|
|
end
|
|
|
|
advance_column
|
|
};
|
|
|
|
# Regular closing tags.
|
|
'</' element_name '>' => {
|
|
emit_text_buffer
|
|
add_token(:T_ELEM_CLOSE, nil)
|
|
|
|
advance_column(@te - @ts)
|
|
|
|
@elements.pop
|
|
};
|
|
|
|
# Self closing elements that are not handled by the HTML mode.
|
|
'/>' => {
|
|
advance_column
|
|
add_token(:T_ELEM_CLOSE, nil)
|
|
|
|
@elements.pop
|
|
};
|
|
|
|
# Note that this rule should be declared at the very bottom as it will
|
|
# otherwise take precedence over the other rules.
|
|
any => { buffer_text_until_eof(eof) };
|
|
*|;
|
|
}%%
|
|
end # Lexer
|
|
end # Oga
|