%%machine xpath_lexer; # %

module Oga
  module XPath
    # Lexer for turning XPath expressions into a set of tokens. Tokens are
    # returned as arrays with every array having two values:
    #
    # 1. The token type as a symbol
    # 2. The token value, or nil if there is no value
    #
    # Basic usage of this lexer is as follows:
    #
    #     lexer  = Oga::XPath::Lexer.new('//foo/bar')
    #     tokens = lexer.lex
    #
    # Alternatively you can stream tokens instead of returning them as a whole:
    #
    #     lexer = Oga::XPath::Lexer.new('//foo/bar')
    #
    #     lexer.advance do |type, value|
    #       # ...
    #     end
    #
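    # As a rough illustration (the exact stream is defined by the grammar at
    # the bottom of this file), lexing `'//foo/bar'` produces something like:
    #
    #     [[:T_AXIS, 'descendant-or-self'], [:T_TYPE_TEST, 'node'],
    #      [:T_SLASH, nil], [:T_IDENT, 'foo'], [:T_SLASH, nil],
    #      [:T_IDENT, 'bar']]
    #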
    # Unlike the XML lexer, the XPath lexer does not support IO instances; it
    # can only lex strings.
    #
    # ## Thread Safety
    #
    # This class keeps track of internal state. As a result it's not safe to
    # share a single instance between multiple threads. However, you're free
    # to use separate instances per thread as there is no global (= class
    # level) shared state.
    #
    # @api private
    class Lexer
      %% write data;

      # % fix highlight

      # Maps certain XPath axes written in their short form to their long
      # form equivalents.
      #
      # @return [Hash]
      AXIS_MAPPING = {
        '@'  => 'attribute',
        '//' => 'descendant-or-self',
        '..' => 'parent',
        '.'  => 'self'
      }

      # Axes that require a separate `node()` call to be emitted.
      #
      # @return [Array]
      AXIS_EMIT_NODE = %w{descendant-or-self parent self}

      # Axes that require an extra T_SLASH token to be emitted.
      #
      # @return [Array]
      AXIS_EMIT_EXTRA_SLASH = %w{descendant-or-self}

      # @param [String] data The data to lex.
      def initialize(data)
        @data = data
      end

      # Gathers all the tokens for the input and returns them as an Array.
      #
      # @see [#advance]
      # @return [Array]
      def lex
        tokens = []

        advance do |type, value|
          tokens << [type, value]
        end

        return tokens
      end

      # Advances through the input and generates the corresponding tokens.
      # Each token is yielded to the supplied block.
      #
      # Each token is an Array in the following format:
      #
      #     [TYPE, VALUE]
      #
      # The type is a symbol, the value is either nil or a String.
      #
      # This method stores the supplied block in `@block` and resets it after
      # the lexer loop has finished.
      #
      # @see [#add_token]
      def advance(&block)
        @block = block

        data  = @data # saves ivar lookups while lexing.
        ts    = nil
        te    = nil
        stack = []
        top   = 0
        cs    = self.class.xpath_lexer_start
        act   = 0
        eof   = @data.bytesize
        p     = 0
        pe    = eof

        _xpath_lexer_eof_trans          = self.class.send(:_xpath_lexer_eof_trans)
        _xpath_lexer_from_state_actions = self.class.send(:_xpath_lexer_from_state_actions)
        _xpath_lexer_index_offsets      = self.class.send(:_xpath_lexer_index_offsets)
        _xpath_lexer_indicies           = self.class.send(:_xpath_lexer_indicies)
        _xpath_lexer_key_spans          = self.class.send(:_xpath_lexer_key_spans)
        _xpath_lexer_to_state_actions   = self.class.send(:_xpath_lexer_to_state_actions)
        _xpath_lexer_trans_actions      = self.class.send(:_xpath_lexer_trans_actions)
        _xpath_lexer_trans_keys         = self.class.send(:_xpath_lexer_trans_keys)
        _xpath_lexer_trans_targs        = self.class.send(:_xpath_lexer_trans_targs)

        %% write exec;

        # % fix highlight
      ensure
        @block = nil
      end

      private

      # Emits a token of which the value is based on the supplied start/stop
      # position.
      #
      # @param [Symbol] type The token type.
      # @param [Fixnum] start
      # @param [Fixnum] stop
      #
      # @see [#text]
      # @see [#add_token]
      def emit(type, start, stop)
        value = slice_input(start, stop)

        add_token(type, value)
      end

      # Returns the text between the specified start and stop position.
      #
      # @param [Fixnum] start
      # @param [Fixnum] stop
      # @return [String]
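      #
      # For example (illustrative only, positions are byte offsets): if
      # `@data` is `'//foo/bar'`, then `slice_input(2, 5)` would return
      # `'foo'`.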
      def slice_input(start, stop)
        return @data.byteslice(start, stop - start)
      end

      # Yields a new token to the supplied block.
      #
      # @param [Symbol] type The token type.
      # @param [String] value The token value.
      #
      # @yieldparam [Symbol] type
      # @yieldparam [String|NilClass] value
      def add_token(type, value = nil)
        @block.call(type, value)
      end

      %%{
        getkey (data.getbyte(p) || 0);

        whitespace = [\n\t ];

        slash  = '/' @{ add_token(:T_SLASH) };
        lparen = '(' @{ add_token(:T_LPAREN) };
        rparen = ')' @{ add_token(:T_RPAREN) };
        comma  = ',' @{ add_token(:T_COMMA) };
        colon  = ':' @{ add_token(:T_COLON) };
        lbrack = '[' @{ add_token(:T_LBRACK) };
        rbrack = ']' @{ add_token(:T_RBRACK) };

        # Identifiers
        #
        # Identifiers are used for element names, namespaces, attribute names,
        # etc. Identifiers have to start with a letter or an underscore; the
        # wildcard `*` is also lexed as an identifier.

        unicode = any - ascii;

        unicode_or_ascii = (unicode | [a-zA-Z\-_0-9\.])*;

        identifier = '*' | (unicode | [a-zA-Z_]) unicode_or_ascii;

        action emit_identifier {
          emit(:T_IDENT, ts, te)
        }

        # Numbers
        #
        # XPath expressions can contain both integers and floats. The W3
        # specification treats both of these as the same type of number. Oga
        # instead lexes them separately so that we can convert the values to
        # the corresponding Ruby types (Fixnum and Float).

        integer = ('-' | '+')* digit+;
        float   = ('-' | '+')* digit+ ('.' digit+)*;

        action emit_integer {
          value = slice_input(ts, te).to_i

          add_token(:T_INT, value)
        }

        action emit_float {
          value = slice_input(ts, te).to_f

          add_token(:T_FLOAT, value)
        }
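
        # Illustration (not part of the grammar): with the rules above, an
        # input such as '10' would be lexed as [:T_INT, 10] and '10.5' as
        # [:T_FLOAT, 10.5]. Both rules match a plain integer; the integer
        # rule wins such ties because it is listed first in the `main`
        # machine below.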

        # Strings
        #
        # Strings can be single or double quoted. They are mainly used for
        # attribute values.
        dquote = '"';
        squote = "'";

        string_dquote = (dquote ^dquote* dquote);
        string_squote = (squote ^squote* squote);

        string = string_dquote | string_squote;

        action emit_string {
          emit(:T_STRING, ts + 1, te - 1)
        }
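
        # Illustration (not part of the grammar): ts and te are the token's
        # start/end positions as set by Ragel, so emitting from ts + 1 to
        # te - 1 strips the surrounding quotes. Lexing "foo" or 'foo' would
        # therefore produce roughly [:T_STRING, 'foo'].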

        # Full Axes
        #
        # XPath axes in their full syntax.
        axis_full = ('ancestor'
                    | 'ancestor-or-self'
                    | 'attribute'
                    | 'child'
                    | 'descendant'
                    | 'descendant-or-self'
                    | 'following'
                    | 'following-sibling'
                    | 'namespace'
                    | 'parent'
                    | 'preceding'
                    | 'preceding-sibling'
                    | 'self') '::';

        action emit_axis_full {
          emit(:T_AXIS, ts, te - 2)
        }

        # Short Axes
        #
        # XPath axes in their abbreviated form. When lexing, these are mapped
        # to their full forms so that the parser doesn't have to take care of
        # this.
        axis_short = '@' | '//' | '..' | '.';

        action emit_axis_short {
          value = AXIS_MAPPING[slice_input(ts, te)]

          add_token(:T_AXIS, value)

          # Short axes that use node() as their default, implicit test. This
          # is added at the lexer level to make it easier to handle these
          # cases at the parser/evaluator level.
          if AXIS_EMIT_NODE.include?(value)
            add_token(:T_TYPE_TEST, 'node')

            if AXIS_EMIT_EXTRA_SLASH.include?(value) and te != eof
              add_token(:T_SLASH)
            end
          end
        }
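
        # Illustration (not part of the grammar): lexing '..' would emit
        # [:T_AXIS, 'parent'] followed by [:T_TYPE_TEST, 'node'], while a
        # leading '//' would additionally emit a T_SLASH token (unless it
        # sits at the very end of the input).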

        # Operators
        #
        # Operators can only be used inside predicates due to "div" and "mod"
        # conflicting with the patterns used for matching identifiers (=
        # element names and the like).

        op_pipe = '|' %{ add_token(:T_PIPE) };
        op_plus = '+' %{ add_token(:T_ADD) };
        op_eq   = '=' %{ add_token(:T_EQ) };
        op_neq  = '!=' %{ add_token(:T_NEQ) };
        op_lt   = '<' %{ add_token(:T_LT) };
        op_gt   = '>' %{ add_token(:T_GT) };
        op_lte  = '<=' %{ add_token(:T_LTE) };
        op_gte  = '>=' %{ add_token(:T_GTE) };

        # These operators require whitespace around them in order to be lexed
        # as operators. This is due to "-" being allowed in node names and
        # "*" also being used as a wildcard.
        #
        # THINK: relying on whitespace is a rather fragile solution, even
        # though the W3 actually recommends this for the "-" operator.
        # Perhaps there's a better way of doing this.

        op_and = ' and ' %{ add_token(:T_AND) };
        op_or  = ' or ' %{ add_token(:T_OR) };
        op_div = ' div ' %{ add_token(:T_DIV) };
        op_mod = ' mod ' %{ add_token(:T_MOD) };
        op_mul = ' * ' %{ add_token(:T_MUL) };
        op_sub = ' - ' %{ add_token(:T_SUB) };
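
        # Illustration (not part of the grammar): because of the whitespace
        # requirement, 'a - b' would be lexed as roughly [:T_IDENT, 'a'],
        # [:T_SUB, nil], [:T_IDENT, 'b'], whereas 'a-b' is a single
        # [:T_IDENT, 'a-b'] since "-" is a valid identifier character.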

        operator = op_pipe
                 | op_and
                 | op_or
                 | op_plus
                 | op_div
                 | op_mod
                 | op_eq
                 | op_neq
                 | op_lt
                 | op_gt
                 | op_lte
                 | op_gte
                 | op_mul
                 | op_sub
                 ;

        # Node type tests
        #
        # While these look like functions, they are actually node type tests.
        # For example, comment() matches all comment nodes.
        #
        # See http://www.w3.org/TR/xpath/#NT-NodeType for more information.

        type_test = (
          'comment' |
          'text' |
          'processing-instruction' |
          'node'
        ) '()';

        action emit_type_test {
          emit(:T_TYPE_TEST, ts, te - 2)
        }
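
        # Illustration (not part of the grammar): lexing 'text()' would
        # produce roughly [:T_TYPE_TEST, 'text']; the trailing "()" is
        # stripped by emitting up to te - 2.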

        # Variables
        #
        # XPath 1.0 allows the use of variables in expressions. Oddly enough,
        # you cannot assign variables in an expression; you can only refer to
        # them. This means that libraries themselves have to expose an
        # interface for setting variables.
        var = '$' identifier;

        action emit_variable {
          emit(:T_VAR, ts + 1, te)
        }
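
        # Illustration (not part of the grammar): lexing '$foo' would produce
        # roughly [:T_VAR, 'foo']; the leading "$" is skipped by emitting from
        # ts + 1.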

        main := |*
          operator;
          whitespace | slash | lparen | rparen | comma | colon | lbrack | rbrack;

          type_test  => emit_type_test;
          var        => emit_variable;
          string     => emit_string;
          integer    => emit_integer;
          float      => emit_float;
          axis_full  => emit_axis_full;
          axis_short => emit_axis_short;
          identifier => emit_identifier;
        *|;
      }%%
    end # Lexer
  end # XPath
end # Oga