Support for lexing XPath wildcard expressions.

To support this, the "*" operator must be surrounded by whitespace in order to be lexed as an operator; a "*" without surrounding whitespace is lexed as a wildcard. This is not ideal, but it will do for now.
2014-06-01 23:01:24 +02:00 · 2014-06-01 23:01:24 +02:00 · 54de2df0c7
parent 48bf1a0628
commit 54de2df0c7
2 changed files with 68 additions and 23 deletions
--- a/lib/oga/xpath/lexer.rl
+++ b/lib/oga/xpath/lexer.rl
@ -3,13 +3,19 @@
 module Oga
  module XPath
    ##
-    # Ragel lexer for lexing XPath queries.
+    # Ragel lexer for lexing XPath expressions.
    #
    class Lexer
      %% write data;
      # % fix highlight
      ##
      # Maps certain XPath axes written in their short form to their long form
      # equivalents.
      #
      # @return [Hash]
      #
      AXIS_MAPPING = {
        '@'  => 'attribute',
        '//' => 'descendant-or-self',
@ -22,23 +28,11 @@ module Oga
      #
      def initialize(data)
        @data = data
        reset
      end
      ##
      # Resets the internal state of the lexer.
      #
      def reset
      end
      ##
      # Gathers all the tokens for the input and returns them as an Array.
      #
      # This method resets the internal state of the lexer after consuming the
      # input.
      #
      # @see [#advance]
      # @return [Array]
      #
@ -49,8 +43,6 @@ module Oga
          tokens << [type, value]
        end
        reset
        return tokens
      end
@ -67,8 +59,6 @@ module Oga
      # This method stores the supplied block in `@block` and resets it after
      # the lexer loop has finished.
      #
      # This method does *not* reset the internal state of the lexer.
      #
      # @param [String] data The String to consume.
      # @return [Array]
      #
@ -153,6 +143,7 @@ module Oga
        rparen = ')' @{ add_token(:T_RPAREN) };
        comma  = ',' @{ add_token(:T_COMMA) };
        colon  = ':' @{ add_token(:T_COLON) };
        star   = '*' @{ add_token(:T_STAR) };
        # Identifiers
        #
@ -250,8 +241,6 @@ module Oga
          | 'and'
          | 'or'
          | '+'
          | '-'
          | '*'
          | 'div'
          | 'mod'
          | '='
@ -261,21 +250,36 @@ module Oga
          | '<='
          | '>=';
        # These operators require whitespace around them in order to be lexed
        # as operators. This is due to "-" being allowed in node names and "*"
        # also being used as a wildcard.
        #
        # THINK: relying on whitespace is a rather fragile solution, even
        # though the W3 actually recommends this for the "-" operator. Perhaps
        # there's a better way of doing this.
        space_operator = space ('*' | '-') space;
        action emit_operator {
          emit(:T_OP, ts, te)
        }
        action emit_space_operator {
          emit(:T_OP, ts + 1, te - 1)
        }
        # Machine that handles the lexing of data inside an XPath predicate.
        # When bumping into a "]" the lexer jumps back to the `main` machine.
        predicate := |*
-          whitespace | slash | lparen | rparen | comma | colon;
+          whitespace | slash | lparen | rparen | comma | colon | star;
          operator       => emit_operator;
          space_operator => emit_space_operator;
          string     => emit_string;
          integer    => emit_integer;
          float      => emit_float;
          axis_full  => emit_axis_full;
          axis_short => emit_axis_short;
          operator   => emit_operator;
          identifier => emit_identifier;
          ']' => {
@ -285,7 +289,7 @@ module Oga
        *|;
        main := |*
-          whitespace | slash | lparen | rparen | comma | colon;
+          whitespace | slash | lparen | rparen | comma | colon | star;
          '[' => {
            add_token(:T_LBRACK)
--- a/spec/oga/xpath/lexer_spec.rb
+++ b/spec/oga/xpath/lexer_spec.rb
@ -111,7 +111,20 @@ describe Oga::XPath::Lexer do
    ]
  end
-  example 'lex a predicate expression using an operator' do
+  example 'lex a wildcard node test' do
    lex_xpath('/*').should == [[:T_SLASH, nil], [:T_STAR, nil]]
  end
  example 'lex a wildcard node test for a namespace' do
    lex_xpath('/*:foo').should == [
      [:T_SLASH, nil],
      [:T_STAR, nil],
      [:T_COLON, nil],
      [:T_IDENT, 'foo']
    ]
  end
  example 'lex a predicate expression using the div operator' do
    lex_xpath('/div[@number=4 div 2]').should == [
      [:T_SLASH, nil],
      [:T_IDENT, 'div'],
@ -126,6 +139,21 @@ describe Oga::XPath::Lexer do
    ]
  end
  example 'lex a predicate expression using the * operator' do
    lex_xpath('/div[@number=4 * 2]').should == [
      [:T_SLASH, nil],
      [:T_IDENT, 'div'],
      [:T_LBRACK, nil],
      [:T_AXIS, 'attribute'],
      [:T_IDENT, 'number'],
      [:T_OP, '='],
      [:T_INT, 4],
      [:T_OP, '*'],
      [:T_INT, 2],
      [:T_RBRACK, nil]
    ]
  end
  example 'lex a predicate expression using axes' do
    lex_xpath('/div[/foo/bar]').should == [
      [:T_SLASH, nil],
@ -139,6 +167,19 @@ describe Oga::XPath::Lexer do
    ]
  end
  example 'lex a predicate expression using a wildcard' do
    lex_xpath('/div[/foo/*]').should == [
      [:T_SLASH, nil],
      [:T_IDENT, 'div'],
      [:T_LBRACK, nil],
      [:T_SLASH, nil],
      [:T_IDENT, 'foo'],
      [:T_SLASH, nil],
      [:T_STAR, nil],
      [:T_RBRACK, nil]
    ]
  end
  # The following are a bunch of examples taken from Wikipedia and the W3 spec
  # to see how the lexer handles them.