Basic working XPath lexer.
This doesn't lex the entire XPath specification yet and still needs more tests.
This commit is contained in:
parent
a50b76a2d8
commit
8dd8d7a519
|
@ -10,6 +10,13 @@ module Oga
|
|||
|
||||
# % fix highlight
|
||||
|
||||
##
# Maps the abbreviated XPath axis forms to the names of their full axes.
# The lexer looks up the matched short form here when emitting a T_AXIS
# token. Frozen because it is a read-only lookup table.
#
# @return [Hash]
#
AXIS_MAPPING = {
  '@'  => 'attribute',
  '//' => 'descendant-or-self',
  '..' => 'parent',
  '.'  => 'self'
}.freeze
|
||||
|
||||
##
|
||||
# @param [String] data The data to lex.
|
||||
#
|
||||
|
@ -38,8 +45,8 @@ module Oga
|
|||
def lex
|
||||
tokens = []
|
||||
|
||||
advance do |token|
|
||||
tokens << token
|
||||
advance do |type, value|
|
||||
tokens << [type, value]
|
||||
end
|
||||
|
||||
reset
|
||||
|
@ -139,8 +146,155 @@ module Oga
|
|||
%%{
|
||||
getkey (data.getbyte(p) || 0);
|
||||
|
||||
whitespace = [\n\t ];
|
||||
|
||||
slash = '/' @{ add_token(:T_SLASH) };
|
||||
lparen = '(' @{ add_token(:T_LPAREN) };
|
||||
rparen = ')' @{ add_token(:T_RPAREN) };
|
||||
comma = ',' @{ add_token(:T_COMMA) };
|
||||
colon = ':' @{ add_token(:T_COLON) };
|
||||
|
||||
# Identifiers
|
||||
#
|
||||
# Identifiers are used for element names, namespaces, attribute names,
|
||||
# etc. Identifiers have to start with a letter.
|
||||
|
||||
identifier = [a-zA-Z]+ [a-zA-Z\-_0-9]*;
|
||||
|
||||
action emit_identifier {
|
||||
emit(:T_IDENT, ts, te)
|
||||
}
|
||||
|
||||
# Numbers
|
||||
#
|
||||
# XPath expressions can contain both integers and floats. The W3
|
||||
# specification treats these both as the same type of number. Oga
|
||||
# instead lexes them separately so that we can convert the values to
|
||||
# the corresponding Ruby types (Fixnum and Float).
|
||||
|
||||
integer = digit+;
|
||||
# A float needs exactly one fractional part (e.g. "1.5"). Plain digits are
# matched by `integer` only, and input such as "1.2.3" is not accepted as a
# single float (its value would otherwise be truncated by String#to_f).
float = digit+ '.' digit+;
|
||||
|
||||
action emit_integer {
|
||||
value = slice_input(ts, te).to_i
|
||||
|
||||
add_token(:T_INT, value)
|
||||
}
|
||||
|
||||
action emit_float {
|
||||
value = slice_input(ts, te).to_f
|
||||
|
||||
add_token(:T_FLOAT, value)
|
||||
}
|
||||
|
||||
# Strings
|
||||
#
|
||||
# Strings can be single or double quoted. They are mainly used for
|
||||
# attribute values.
|
||||
#
|
||||
dquote = '"';
|
||||
squote = "'";
|
||||
|
||||
string_dquote = (dquote ^dquote+ dquote);
|
||||
string_squote = (squote ^squote+ squote);
|
||||
|
||||
string = string_dquote | string_squote;
|
||||
|
||||
action emit_string {
|
||||
emit(:T_STRING, ts + 1, te - 1)
|
||||
}
|
||||
|
||||
# Full Axes
|
||||
#
|
||||
# XPath axes in their full syntax.
|
||||
#
|
||||
axis_full = ('ancestor'
|
||||
| 'ancestor-or-self'
|
||||
| 'attribute'
|
||||
| 'child'
|
||||
| 'descendant'
|
||||
| 'descendant-or-self'
|
||||
| 'following'
|
||||
| 'following-sibling'
|
||||
| 'namespace'
|
||||
| 'parent'
|
||||
| 'preceding'
|
||||
| 'preceding-sibling'
|
||||
| 'self') '::';
|
||||
|
||||
action emit_axis_full {
|
||||
emit(:T_AXIS, ts, te - 2)
|
||||
}
|
||||
|
||||
# Short Axes
|
||||
#
|
||||
# XPath axes in their abbreviated form. When lexing these are mapped to
|
||||
# their full forms so that the parser doesn't have to take care of
|
||||
# this.
|
||||
#
|
||||
axis_short = '@' | '//' | '..' | '.';
|
||||
|
||||
action emit_axis_short {
|
||||
value = AXIS_MAPPING[slice_input(ts, te)]
|
||||
|
||||
add_token(:T_AXIS, value)
|
||||
}
|
||||
|
||||
# Operators
|
||||
#
|
||||
# Operators can only be used inside predicates due to "div" and "mod"
|
||||
# conflicting with the patterns used for matching identifiers (=
|
||||
# element names and the likes).
|
||||
|
||||
operator = '|'
|
||||
| 'and'
|
||||
| 'or'
|
||||
| '+'
|
||||
| '-'
|
||||
| '*'
|
||||
| 'div'
|
||||
| 'mod'
|
||||
| '='
|
||||
| '!='
|
||||
| '<'
|
||||
| '>'
|
||||
| '<='
|
||||
| '>=';
|
||||
|
||||
action emit_operator {
|
||||
emit(:T_OP, ts, te)
|
||||
}
|
||||
|
||||
# Machine that handles the lexing of data inside an XPath predicate.
|
||||
# When bumping into a "]" the lexer jumps back to the `main` machine.
|
||||
predicate := |*
|
||||
whitespace | slash | lparen | rparen | comma | colon;
|
||||
|
||||
string => emit_string;
|
||||
integer => emit_integer;
|
||||
float => emit_float;
|
||||
axis_full => emit_axis_full;
|
||||
axis_short => emit_axis_short;
|
||||
operator => emit_operator;
|
||||
identifier => emit_identifier;
|
||||
|
||||
']' => {
|
||||
add_token(:T_RBRACK)
|
||||
fnext main;
|
||||
};
|
||||
*|;
|
||||
|
||||
# Entry-point machine. Lexes everything outside of predicates and jumps to
# the `predicate` machine when bumping into a "[".
#
# There is deliberately no catch-all `any` alternative here: a scanner
# resolves equal-length matches in favour of the pattern listed first, so a
# leading `any` would win over every single-character pattern, including
# '[', and the jump to `predicate` would never run.
main := |*
  whitespace | slash | lparen | rparen | comma | colon;

  '[' => {
    add_token(:T_LBRACK)
    fnext predicate;
  };

  axis_full  => emit_axis_full;
  axis_short => emit_axis_short;
  identifier => emit_identifier;
*|;
|
||||
}%%
|
||||
end # Lexer
|
||||
|
|
|
@ -0,0 +1,113 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::XPath::Lexer do
|
||||
example 'lex a simple expression' do
|
||||
lex_xpath('/foo').should == [[:T_SLASH, nil], [:T_IDENT, 'foo']]
|
||||
end
|
||||
|
||||
example 'lex a function call without arguments' do
|
||||
lex_xpath('count()').should == [
|
||||
[:T_IDENT, 'count'],
|
||||
[:T_LPAREN, nil],
|
||||
[:T_RPAREN, nil]
|
||||
]
|
||||
end
|
||||
|
||||
example 'lex a function call with a single argument' do
|
||||
lex_xpath('count(foo)').should == [
|
||||
[:T_IDENT, 'count'],
|
||||
[:T_LPAREN, nil],
|
||||
[:T_IDENT, 'foo'],
|
||||
[:T_RPAREN, nil]
|
||||
]
|
||||
end
|
||||
|
||||
example 'lex a function call with two arguments' do
|
||||
lex_xpath('count(foo, bar)').should == [
|
||||
[:T_IDENT, 'count'],
|
||||
[:T_LPAREN, nil],
|
||||
[:T_IDENT, 'foo'],
|
||||
[:T_COMMA, nil],
|
||||
[:T_IDENT, 'bar'],
|
||||
[:T_RPAREN, nil]
|
||||
]
|
||||
end
|
||||
|
||||
example 'lex a simple predicate expression' do
|
||||
lex_xpath('/foo[bar]').should == [
|
||||
[:T_SLASH, nil],
|
||||
[:T_IDENT, 'foo'],
|
||||
[:T_LBRACK, nil],
|
||||
[:T_IDENT, 'bar'],
|
||||
[:T_RBRACK, nil]
|
||||
]
|
||||
end
|
||||
|
||||
example 'lex a predicate that checks for equality' do
|
||||
lex_xpath('/foo[@bar="baz"]').should == [
|
||||
[:T_SLASH, nil],
|
||||
[:T_IDENT, 'foo'],
|
||||
[:T_LBRACK, nil],
|
||||
[:T_AXIS, 'attribute'],
|
||||
[:T_IDENT, 'bar'],
|
||||
[:T_OP, '='],
|
||||
[:T_STRING, 'baz'],
|
||||
[:T_RBRACK, nil]
|
||||
]
|
||||
end
|
||||
|
||||
# An integer inside a predicate is emitted as a T_INT token carrying an
# Integer value rather than a String.
example 'lex a predicate that uses an integer' do
  lex_xpath('/foo[1]').should == [
    [:T_SLASH, nil],
    [:T_IDENT, 'foo'],
    [:T_LBRACK, nil],
    [:T_INT, 1],
    [:T_RBRACK, nil]
  ]
end
|
||||
|
||||
example 'lex a predicate that uses a float' do
|
||||
lex_xpath('/foo[1.5]').should == [
|
||||
[:T_SLASH, nil],
|
||||
[:T_IDENT, 'foo'],
|
||||
[:T_LBRACK, nil],
|
||||
[:T_FLOAT, 1.5],
|
||||
[:T_RBRACK, nil]
|
||||
]
|
||||
end
|
||||
|
||||
example 'lex a predicate using a function' do
|
||||
lex_xpath('/foo[bar()]').should == [
|
||||
[:T_SLASH, nil],
|
||||
[:T_IDENT, 'foo'],
|
||||
[:T_LBRACK, nil],
|
||||
[:T_IDENT, 'bar'],
|
||||
[:T_LPAREN, nil],
|
||||
[:T_RPAREN, nil],
|
||||
[:T_RBRACK, nil]
|
||||
]
|
||||
end
|
||||
|
||||
example 'lex an axis using the full syntax form' do
|
||||
lex_xpath('/parent::node()').should == [
|
||||
[:T_SLASH, nil],
|
||||
[:T_AXIS, 'parent'],
|
||||
[:T_IDENT, 'node'],
|
||||
[:T_LPAREN, nil],
|
||||
[:T_RPAREN, nil]
|
||||
]
|
||||
end
|
||||
|
||||
example 'lex an axis using the short syntax form' do
|
||||
lex_xpath('/..').should == [[:T_SLASH, nil], [:T_AXIS, 'parent']]
|
||||
end
|
||||
|
||||
example 'lex a node test using a namespace' do
|
||||
lex_xpath('/foo:bar').should == [
|
||||
[:T_SLASH, nil],
|
||||
[:T_IDENT, 'foo'],
|
||||
[:T_COLON, nil],
|
||||
[:T_IDENT, 'bar']
|
||||
]
|
||||
end
|
||||
end
|
|
@ -22,6 +22,16 @@ module Oga
|
|||
return Oga::XML::Lexer.new(input, options).lex
|
||||
end
|
||||
|
||||
##
|
||||
# Lexes an XPath expression.
|
||||
#
|
||||
# @param [String] input
|
||||
# @return [Array]
|
||||
#
|
||||
def lex_xpath(input)
  # Build a lexer for the given expression and hand back the result of #lex.
  lexer = Oga::XPath::Lexer.new(input)

  return lexer.lex
end
|
||||
|
||||
##
|
||||
# Parses the given XML and returns an AST.
|
||||
#
|
||||
|
|
Loading…
Reference in New Issue