From 8dd8d7a51958e87f226ecfc9513b3f61aaa6aee3 Mon Sep 17 00:00:00 2001
From: Yorick Peterse
Date: Sun, 1 Jun 2014 19:24:35 +0200
Subject: [PATCH] Basic working XPath lexer.

This doesn't lex everything of the XPath specification just yet and needs more
tests.
---
 lib/oga/xpath/lexer.rl       | 160 ++++++++++++++++++++++++++++++++++-
 spec/oga/xpath/lexer_spec.rb | 113 +++++++++++++++++++++++++
 spec/support/parsing.rb      |  10 +++
 3 files changed, 280 insertions(+), 3 deletions(-)
 create mode 100644 spec/oga/xpath/lexer_spec.rb

diff --git a/lib/oga/xpath/lexer.rl b/lib/oga/xpath/lexer.rl
index f497cd5..4565d2e 100644
--- a/lib/oga/xpath/lexer.rl
+++ b/lib/oga/xpath/lexer.rl
@@ -10,6 +10,13 @@ module Oga
 
       # % fix highlight
 
+      AXIS_MAPPING = {
+        '@'  => 'attribute',
+        '//' => 'descendant-or-self',
+        '..' => 'parent',
+        '.'  => 'self'
+      }.freeze
+
       ##
       # @param [String] data The data to lex.
       #
@@ -38,8 +45,8 @@ module Oga
       def lex
         tokens = []
 
-        advance do |token|
-          tokens << token
+        advance do |type, value|
+          tokens << [type, value]
         end
 
         reset
@@ -139,8 +146,155 @@ module Oga
       %%{
         getkey (data.getbyte(p) || 0);
 
+        whitespace = [\n\t ];
+
+        slash  = '/' @{ add_token(:T_SLASH) };
+        lparen = '(' @{ add_token(:T_LPAREN) };
+        rparen = ')' @{ add_token(:T_RPAREN) };
+        comma  = ',' @{ add_token(:T_COMMA) };
+        colon  = ':' @{ add_token(:T_COLON) };
+
+        # Identifiers
+        #
+        # Identifiers are used for element names, namespaces, attribute names,
+        # etc. Identifiers have to start with a letter.
+
+        identifier = [a-zA-Z]+ [a-zA-Z\-_0-9]*;
+
+        action emit_identifier {
+          emit(:T_IDENT, ts, te)
+        }
+
+        # Numbers
+        #
+        # XPath expressions can contain both integers and floats. The W3
+        # specification treats these both as the same type of number. Oga
+        # instead lexes them separately so that we can convert the values to
+        # the corresponding Ruby types (Fixnum and Float).
+
+        integer = digit+;
+        float   = digit+ ('.' digit+);
+
+        action emit_integer {
+          value = slice_input(ts, te).to_i
+
+          add_token(:T_INT, value)
+        }
+
+        action emit_float {
+          value = slice_input(ts, te).to_f
+
+          add_token(:T_FLOAT, value)
+        }
+
+        # Strings
+        #
+        # Strings can be single or double quoted. They are mainly used for
+        # attribute values.
+        #
+        dquote = '"';
+        squote = "'";
+
+        string_dquote = (dquote ^dquote+ dquote);
+        string_squote = (squote ^squote+ squote);
+
+        string = string_dquote | string_squote;
+
+        action emit_string {
+          emit(:T_STRING, ts + 1, te - 1)
+        }
+
+        # Full Axes
+        #
+        # XPath axes in their full syntax.
+        #
+        axis_full = ('ancestor'
+          | 'ancestor-or-self'
+          | 'attribute'
+          | 'child'
+          | 'descendant'
+          | 'descendant-or-self'
+          | 'following'
+          | 'following-sibling'
+          | 'namespace'
+          | 'parent'
+          | 'preceding'
+          | 'preceding-sibling'
+          | 'self') '::';
+
+        action emit_axis_full {
+          emit(:T_AXIS, ts, te - 2)
+        }
+
+        # Short Axes
+        #
+        # XPath axes in their abbreviated form. When lexing these are mapped to
+        # their full forms so that the parser doesn't have to take care of
+        # this.
+        #
+        axis_short = '@' | '//' | '..' | '.';
+
+        action emit_axis_short {
+          value = AXIS_MAPPING[slice_input(ts, te)]
+
+          add_token(:T_AXIS, value)
+        }
+
+        # Operators
+        #
+        # Operators can only be used inside predicates due to "div" and "mod"
+        # conflicting with the patterns used for matching identifiers (=
+        # element names and the likes).
+
+        operator = '|'
+          | 'and'
+          | 'or'
+          | '+'
+          | '-'
+          | '*'
+          | 'div'
+          | 'mod'
+          | '='
+          | '!='
+          | '<'
+          | '>'
+          | '<='
+          | '>=';
+
+        action emit_operator {
+          emit(:T_OP, ts, te)
+        }
+
+        # Machine that handles the lexing of data inside an XPath predicate.
+        # When bumping into a "]" the lexer jumps back to the `main` machine.
+        predicate := |*
+          whitespace | slash | lparen | rparen | comma | colon;
+
+          string     => emit_string;
+          integer    => emit_integer;
+          float      => emit_float;
+          axis_full  => emit_axis_full;
+          axis_short => emit_axis_short;
+          operator   => emit_operator;
+          identifier => emit_identifier;
+
+          ']' => {
+            add_token(:T_RBRACK)
+            fnext main;
+          };
+        *|;
+
         main := |*
-          any => { };
+          whitespace | slash | lparen | rparen | comma | colon;
+
+          '[' => {
+            add_token(:T_LBRACK)
+            fnext predicate;
+          };
+
+          axis_full  => emit_axis_full;
+          axis_short => emit_axis_short;
+          identifier => emit_identifier;
         *|;
       }%%
     end # Lexer
diff --git a/spec/oga/xpath/lexer_spec.rb b/spec/oga/xpath/lexer_spec.rb
new file mode 100644
index 0000000..0debdeb
--- /dev/null
+++ b/spec/oga/xpath/lexer_spec.rb
@@ -0,0 +1,113 @@
+require 'spec_helper'
+
+describe Oga::XPath::Lexer do
+  example 'lex a simple expression' do
+    lex_xpath('/foo').should == [[:T_SLASH, nil], [:T_IDENT, 'foo']]
+  end
+
+  example 'lex a function call without arguments' do
+    lex_xpath('count()').should == [
+      [:T_IDENT, 'count'],
+      [:T_LPAREN, nil],
+      [:T_RPAREN, nil]
+    ]
+  end
+
+  example 'lex a function call with a single argument' do
+    lex_xpath('count(foo)').should == [
+      [:T_IDENT, 'count'],
+      [:T_LPAREN, nil],
+      [:T_IDENT, 'foo'],
+      [:T_RPAREN, nil]
+    ]
+  end
+
+  example 'lex a function call with two arguments' do
+    lex_xpath('count(foo, bar)').should == [
+      [:T_IDENT, 'count'],
+      [:T_LPAREN, nil],
+      [:T_IDENT, 'foo'],
+      [:T_COMMA, nil],
+      [:T_IDENT, 'bar'],
+      [:T_RPAREN, nil]
+    ]
+  end
+
+  example 'lex a simple predicate expression' do
+    lex_xpath('/foo[bar]').should == [
+      [:T_SLASH, nil],
+      [:T_IDENT, 'foo'],
+      [:T_LBRACK, nil],
+      [:T_IDENT, 'bar'],
+      [:T_RBRACK, nil]
+    ]
+  end
+
+  example 'lex a predicate that checks for equality' do
+    lex_xpath('/foo[@bar="baz"]').should == [
+      [:T_SLASH, nil],
+      [:T_IDENT, 'foo'],
+      [:T_LBRACK, nil],
+      [:T_AXIS, 'attribute'],
+      [:T_IDENT, 'bar'],
+      [:T_OP, '='],
+      [:T_STRING, 'baz'],
+      [:T_RBRACK, nil]
+    ]
+  end
+
+  example 'lex a predicate that uses an integer' do
+    lex_xpath('/foo[1]').should == [
+      [:T_SLASH, nil],
+      [:T_IDENT, 'foo'],
+      [:T_LBRACK, nil],
+      [:T_INT, 1],
+      [:T_RBRACK, nil]
+    ]
+  end
+
+  example 'lex a predicate that uses a float' do
+    lex_xpath('/foo[1.5]').should == [
+      [:T_SLASH, nil],
+      [:T_IDENT, 'foo'],
+      [:T_LBRACK, nil],
+      [:T_FLOAT, 1.5],
+      [:T_RBRACK, nil]
+    ]
+  end
+
+  example 'lex a predicate using a function' do
+    lex_xpath('/foo[bar()]').should == [
+      [:T_SLASH, nil],
+      [:T_IDENT, 'foo'],
+      [:T_LBRACK, nil],
+      [:T_IDENT, 'bar'],
+      [:T_LPAREN, nil],
+      [:T_RPAREN, nil],
+      [:T_RBRACK, nil]
+    ]
+  end
+
+  example 'lex an axis using the full syntax form' do
+    lex_xpath('/parent::node()').should == [
+      [:T_SLASH, nil],
+      [:T_AXIS, 'parent'],
+      [:T_IDENT, 'node'],
+      [:T_LPAREN, nil],
+      [:T_RPAREN, nil]
+    ]
+  end
+
+  example 'lex an axis using the short syntax form' do
+    lex_xpath('/..').should == [[:T_SLASH, nil], [:T_AXIS, 'parent']]
+  end
+
+  example 'lex a node test using a namespace' do
+    lex_xpath('/foo:bar').should == [
+      [:T_SLASH, nil],
+      [:T_IDENT, 'foo'],
+      [:T_COLON, nil],
+      [:T_IDENT, 'bar']
+    ]
+  end
+end
diff --git a/spec/support/parsing.rb b/spec/support/parsing.rb
index 5486708..411fbcb 100644
--- a/spec/support/parsing.rb
+++ b/spec/support/parsing.rb
@@ -22,6 +22,16 @@ module Oga
       return Oga::XML::Lexer.new(input, options).lex
     end
 
+    ##
+    # Lexes an XPath expression.
+    #
+    # @param [String] input
+    # @return [Array]
+    #
+    def lex_xpath(input)
+      return Oga::XPath::Lexer.new(input).lex
+    end
+
     ##
     # Parses the given XML and returns an AST.
     #