Support for lexing XPath wildcard expressions.

To support this we need to require whitespace around the "*" operator. This is
not ideal but it will do for now.
This commit is contained in:
Yorick Peterse 2014-06-01 23:01:24 +02:00
parent 48bf1a0628
commit 54de2df0c7
2 changed files with 68 additions and 23 deletions

View File

@ -3,13 +3,19 @@
module Oga
module XPath
##
# Ragel lexer for lexing XPath queries.
# Ragel lexer for lexing XPath expressions.
#
class Lexer
%% write data;
# % fix highlight
##
# Maps certain XPath axes written in their short form to their long form
# equivalents.
#
# @return [Hash]
#
AXIS_MAPPING = {
'@' => 'attribute',
'//' => 'descendant-or-self',
@ -22,23 +28,11 @@ module Oga
#
def initialize(data)
@data = data
reset
end
##
# Resets the internal state of the lexer.
#
def reset
end
##
# Gathers all the tokens for the input and returns them as an Array.
#
# This method resets the internal state of the lexer after consuming the
# input.
#
# @see [#advance]
# @return [Array]
#
@ -49,8 +43,6 @@ module Oga
tokens << [type, value]
end
reset
return tokens
end
@ -67,8 +59,6 @@ module Oga
# This method stores the supplied block in `@block` and resets it after
# the lexer loop has finished.
#
# This method does *not* reset the internal state of the lexer.
#
# @param [String] data The String to consume.
# @return [Array]
#
@ -153,6 +143,7 @@ module Oga
rparen = ')' @{ add_token(:T_RPAREN) };
comma = ',' @{ add_token(:T_COMMA) };
colon = ':' @{ add_token(:T_COLON) };
star = '*' @{ add_token(:T_STAR) };
# Identifiers
#
@ -250,8 +241,6 @@ module Oga
| 'and'
| 'or'
| '+'
| '-'
| '*'
| 'div'
| 'mod'
| '='
@ -261,21 +250,36 @@ module Oga
| '<='
| '>=';
# These operators require whitespace around them in order to be lexed
# as operators. This is due to "-" being allowed in node names and "*"
# also being used as a wildcard.
#
# THINK: relying on whitespace is a rather fragile solution, even
# though the W3 actually recommends this for the "-" operator. Perhaps
# there's a better way of doing this.
space_operator = space ('*' | '-') space;
# Emits a T_OP token, handing the scanner's token start/end positions
# (`ts` and `te`) to `emit` unchanged.
action emit_operator {
emit(:T_OP, ts, te)
}
# Emits a T_OP token for an operator matched by `space_operator`. That
# pattern includes one `space` character on each side of the operator, so
# the range handed to `emit` is narrowed by one position at both ends to
# keep the surrounding whitespace out of the token.
action emit_space_operator {
emit(:T_OP, ts + 1, te - 1)
}
# Machine that handles the lexing of data inside an XPath predicate.
# When bumping into a "]" the lexer jumps back to the `main` machine.
predicate := |*
whitespace | slash | lparen | rparen | comma | colon;
whitespace | slash | lparen | rparen | comma | colon | star;
operator => emit_operator;
space_operator => emit_space_operator;
string => emit_string;
integer => emit_integer;
float => emit_float;
axis_full => emit_axis_full;
axis_short => emit_axis_short;
operator => emit_operator;
identifier => emit_identifier;
']' => {
@ -285,7 +289,7 @@ module Oga
*|;
main := |*
whitespace | slash | lparen | rparen | comma | colon;
whitespace | slash | lparen | rparen | comma | colon | star;
'[' => {
add_token(:T_LBRACK)

View File

@ -111,7 +111,20 @@ describe Oga::XPath::Lexer do
]
end
example 'lex a predicate expression using an operator' do
# A bare "*" step following "/" is expected to be lexed as T_STAR.
example 'lex a wildcard node test' do
  lex_xpath('/*').should == [[:T_SLASH, nil], [:T_STAR, nil]]
end
# A "*" directly followed by ":" and a name is expected to produce T_STAR,
# T_COLON and T_IDENT tokens in that order.
example 'lex a wildcard node test for a namespace' do
  expected = [
    [:T_SLASH, nil],
    [:T_STAR, nil],
    [:T_COLON, nil],
    [:T_IDENT, 'foo']
  ]

  lex_xpath('/*:foo').should == expected
end
example 'lex a predicate expression using the div operator' do
lex_xpath('/div[@number=4 div 2]').should == [
[:T_SLASH, nil],
[:T_IDENT, 'div'],
@ -126,6 +139,21 @@ describe Oga::XPath::Lexer do
]
end
# Inside a predicate, a "*" surrounded by whitespace is expected to be lexed
# as a T_OP token (multiplication) instead of T_STAR.
example 'lex a predicate expression using the * operator' do
  expected = [
    [:T_SLASH, nil],
    [:T_IDENT, 'div'],
    [:T_LBRACK, nil],
    [:T_AXIS, 'attribute'],
    [:T_IDENT, 'number'],
    [:T_OP, '='],
    [:T_INT, 4],
    [:T_OP, '*'],
    [:T_INT, 2],
    [:T_RBRACK, nil]
  ]

  lex_xpath('/div[@number=4 * 2]').should == expected
end
example 'lex a predicate expression using axes' do
lex_xpath('/div[/foo/bar]').should == [
[:T_SLASH, nil],
@ -139,6 +167,19 @@ describe Oga::XPath::Lexer do
]
end
# Inside a predicate, a "*" that is not surrounded by whitespace is expected
# to be lexed as T_STAR.
example 'lex a predicate expression using a wildcard' do
  expected = [
    [:T_SLASH, nil],
    [:T_IDENT, 'div'],
    [:T_LBRACK, nil],
    [:T_SLASH, nil],
    [:T_IDENT, 'foo'],
    [:T_SLASH, nil],
    [:T_STAR, nil],
    [:T_RBRACK, nil]
  ]

  lex_xpath('/div[/foo/*]').should == expected
end
# The following are a bunch of examples taken from Wikipedia and the W3 spec
# to see how the lexer handles them.