Support for lexing XPath wildcard expressions.

To support this we need to require whitespace around the "*" operator. This is
not ideal but it will do for now.
This commit is contained in:
Yorick Peterse 2014-06-01 23:01:24 +02:00
parent 48bf1a0628
commit 54de2df0c7
2 changed files with 68 additions and 23 deletions

View File

@ -3,13 +3,19 @@
module Oga module Oga
module XPath module XPath
## ##
# Ragel lexer for lexing XPath queries. # Ragel lexer for lexing XPath expressions.
# #
class Lexer class Lexer
%% write data; %% write data;
# % fix highlight # % fix highlight
##
# Maps certain XPath axes written in their short form to their long form
# equivalents.
#
# @return [Hash]
#
AXIS_MAPPING = { AXIS_MAPPING = {
'@' => 'attribute', '@' => 'attribute',
'//' => 'descendant-or-self', '//' => 'descendant-or-self',
@ -22,23 +28,11 @@ module Oga
# #
def initialize(data) def initialize(data)
@data = data @data = data
reset
end
##
# Resets the internal state of the lexer.
#
def reset
end end
## ##
# Gathers all the tokens for the input and returns them as an Array. # Gathers all the tokens for the input and returns them as an Array.
# #
# This method resets the internal state of the lexer after consuming the
# input.
#
# @see [#advance] # @see [#advance]
# @return [Array] # @return [Array]
# #
@ -49,8 +43,6 @@ module Oga
tokens << [type, value] tokens << [type, value]
end end
reset
return tokens return tokens
end end
@ -67,8 +59,6 @@ module Oga
# This method stores the supplied block in `@block` and resets it after # This method stores the supplied block in `@block` and resets it after
# the lexer loop has finished. # the lexer loop has finished.
# #
# This method does *not* reset the internal state of the lexer.
#
# @param [String] data The String to consume. # @param [String] data The String to consume.
# @return [Array] # @return [Array]
# #
@ -153,6 +143,7 @@ module Oga
rparen = ')' @{ add_token(:T_RPAREN) }; rparen = ')' @{ add_token(:T_RPAREN) };
comma = ',' @{ add_token(:T_COMMA) }; comma = ',' @{ add_token(:T_COMMA) };
colon = ':' @{ add_token(:T_COLON) }; colon = ':' @{ add_token(:T_COLON) };
star = '*' @{ add_token(:T_STAR) };
# Identifiers # Identifiers
# #
@ -250,8 +241,6 @@ module Oga
| 'and' | 'and'
| 'or' | 'or'
| '+' | '+'
| '-'
| '*'
| 'div' | 'div'
| 'mod' | 'mod'
| '=' | '='
@ -261,21 +250,36 @@ module Oga
| '<=' | '<='
| '>='; | '>=';
# These operators require whitespace around them in order to be lexed
# as operators. This is due to "-" being allowed in node names and "*"
# also being used as a wildcard.
#
# THINK: relying on whitespace is a rather fragile solution, even
# though the W3 actually recommends this for the "-" operator. Perhaps
# there's a better way of doing this.
space_operator = space ('*' | '-') space;
action emit_operator { action emit_operator {
emit(:T_OP, ts, te) emit(:T_OP, ts, te)
} }
action emit_space_operator {
emit(:T_OP, ts + 1, te - 1)
}
# Machine that handles the lexing of data inside an XPath predicate. # Machine that handles the lexing of data inside an XPath predicate.
# When bumping into a "]" the lexer jumps back to the `main` machine. # When bumping into a "]" the lexer jumps back to the `main` machine.
predicate := |* predicate := |*
whitespace | slash | lparen | rparen | comma | colon; whitespace | slash | lparen | rparen | comma | colon | star;
operator => emit_operator;
space_operator => emit_space_operator;
string => emit_string; string => emit_string;
integer => emit_integer; integer => emit_integer;
float => emit_float; float => emit_float;
axis_full => emit_axis_full; axis_full => emit_axis_full;
axis_short => emit_axis_short; axis_short => emit_axis_short;
operator => emit_operator;
identifier => emit_identifier; identifier => emit_identifier;
']' => { ']' => {
@ -285,7 +289,7 @@ module Oga
*|; *|;
main := |* main := |*
whitespace | slash | lparen | rparen | comma | colon; whitespace | slash | lparen | rparen | comma | colon | star;
'[' => { '[' => {
add_token(:T_LBRACK) add_token(:T_LBRACK)

View File

@ -111,7 +111,20 @@ describe Oga::XPath::Lexer do
] ]
end end
example 'lex a predicate expression using an operator' do example 'lex a whildcard node test' do
lex_xpath('/*').should == [[:T_SLASH, nil], [:T_STAR, nil]]
end
example 'lex a wildcard node test for a namespace' do
lex_xpath('/*:foo').should == [
[:T_SLASH, nil],
[:T_STAR, nil],
[:T_COLON, nil],
[:T_IDENT, 'foo']
]
end
example 'lex a predicate expression using the div operator' do
lex_xpath('/div[@number=4 div 2]').should == [ lex_xpath('/div[@number=4 div 2]').should == [
[:T_SLASH, nil], [:T_SLASH, nil],
[:T_IDENT, 'div'], [:T_IDENT, 'div'],
@ -126,6 +139,21 @@ describe Oga::XPath::Lexer do
] ]
end end
example 'lex a predicate expression using the * operator' do
lex_xpath('/div[@number=4 * 2]').should == [
[:T_SLASH, nil],
[:T_IDENT, 'div'],
[:T_LBRACK, nil],
[:T_AXIS, 'attribute'],
[:T_IDENT, 'number'],
[:T_OP, '='],
[:T_INT, 4],
[:T_OP, '*'],
[:T_INT, 2],
[:T_RBRACK, nil]
]
end
example 'lex a predicate expression using axes' do example 'lex a predicate expression using axes' do
lex_xpath('/div[/foo/bar]').should == [ lex_xpath('/div[/foo/bar]').should == [
[:T_SLASH, nil], [:T_SLASH, nil],
@ -139,6 +167,19 @@ describe Oga::XPath::Lexer do
] ]
end end
example 'lex a predicate expression using a wildcard' do
lex_xpath('/div[/foo/*]').should == [
[:T_SLASH, nil],
[:T_IDENT, 'div'],
[:T_LBRACK, nil],
[:T_SLASH, nil],
[:T_IDENT, 'foo'],
[:T_SLASH, nil],
[:T_STAR, nil],
[:T_RBRACK, nil]
]
end
# The following are a bunch of examples taken from Wikipedia and the W3 spec # The following are a bunch of examples taken from Wikipedia and the W3 spec
# to see how the lexer handles them. # to see how the lexer handles them.