oga/lib/oga/xpath/lexer.rl

%%machine xpath_lexer; # %
module Oga
  module XPath
    # Lexer for turning XPath expressions into a set of tokens. Tokens are
    # returned as arrays with every array having two values:
    #
    # 1. The token type as a symbol
    # 2. The token value, or nil if there is no value
    #
    # Basic usage of this lexer is as follows:
    #
    #     lexer  = Oga::XPath::Lexer.new('//foo/bar')
    #     tokens = lexer.lex
    #
    # Alternatively you can stream tokens instead of returning them as a
    # whole:
    #
    #     lexer = Oga::XPath::Lexer.new('//foo/bar')
    #
    #     lexer.advance do |type, value|
    #
    #     end
    #
    # Unlike the XML lexer, the XPath lexer does not support IO instances; it
    # can only lex strings.
    #
    # ## Thread Safety
    #
    # This class keeps track of internal state. As a result it's not safe to
    # share a single instance between multiple threads. However, you're free
    # to use separate instances per thread as there is no global (= class
    # level) shared state.
    #
    # @api private
    class Lexer
      %% write data;

      # % fix highlight

      # Maps certain XPath axes written in their short form to their long
      # form equivalents.
      #
      # @return [Hash]
      AXIS_MAPPING = {
        '@'  => 'attribute',
        '//' => 'descendant-or-self',
        '..' => 'parent',
        '.'  => 'self'
      }

      # Axes that require a separate `node()` call to be emitted.
      #
      # @return [Array]
      AXIS_EMIT_NODE = %w{descendant-or-self parent self}

      # Axes that require an extra T_SLASH token to be emitted.
      #
      # @return [Array]
      AXIS_EMIT_EXTRA_SLASH = %w{descendant-or-self}

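      # As an illustration of how these constants interact (the token arrays
      # below are a sketch derived from the emit_axis_short action further
      # down in this file, not authoritative output):
      #
      #     Oga::XPath::Lexer.new('..').lex
      #     # => [[:T_AXIS, 'parent'], [:T_TYPE_TEST, 'node']]
      #
      #     Oga::XPath::Lexer.new('@class').lex
      #     # => [[:T_AXIS, 'attribute'], [:T_IDENT, 'class']]
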
      # @param [String] data The data to lex.
      def initialize(data)
        @data = data
      end

      # Gathers all the tokens for the input and returns them as an Array.
      #
      # @see [#advance]
      # @return [Array]
      def lex
        tokens = []

        advance do |type, value|
          tokens << [type, value]
        end

        return tokens
      end

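      # An example of the kind of token array #lex produces (a sketch based
      # on the grammar at the bottom of this file, not authoritative output):
      #
      #     Oga::XPath::Lexer.new('foo/bar').lex
      #     # => [[:T_IDENT, 'foo'], [:T_SLASH, nil], [:T_IDENT, 'bar']]
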
      # Advances through the input and generates the corresponding tokens.
      # Each token is yielded to the supplied block.
      #
      # Each token is an Array in the following format:
      #
      #     [TYPE, VALUE]
      #
      # The type is a symbol, the value is either nil or a String.
      #
      # This method stores the supplied block in `@block` and resets it after
      # the lexer loop has finished.
      #
      # @see [#add_token]
      def advance(&block)
        @block = block

        data = @data # saves ivar lookups while lexing.
        ts   = nil
        te   = nil

        stack = []
        top   = 0
        cs    = self.class.xpath_lexer_start
        act   = 0
        eof   = @data.bytesize
        p     = 0
        pe    = eof

        _xpath_lexer_eof_trans          = self.class.send(:_xpath_lexer_eof_trans)
        _xpath_lexer_from_state_actions = self.class.send(:_xpath_lexer_from_state_actions)
        _xpath_lexer_index_offsets      = self.class.send(:_xpath_lexer_index_offsets)
        _xpath_lexer_indicies           = self.class.send(:_xpath_lexer_indicies)
        _xpath_lexer_key_spans          = self.class.send(:_xpath_lexer_key_spans)
        _xpath_lexer_to_state_actions   = self.class.send(:_xpath_lexer_to_state_actions)
        _xpath_lexer_trans_actions      = self.class.send(:_xpath_lexer_trans_actions)
        _xpath_lexer_trans_keys         = self.class.send(:_xpath_lexer_trans_keys)
        _xpath_lexer_trans_targs        = self.class.send(:_xpath_lexer_trans_targs)

        %% write exec;

        # % fix highlight
      ensure
        @block = nil
      end

      private

      # Emits a token of which the value is based on the supplied start/stop
      # position.
      #
      # @param [Symbol] type The token type.
      # @param [Fixnum] start
      # @param [Fixnum] stop
      #
      # @see [#text]
      # @see [#add_token]
      def emit(type, start, stop)
        value = slice_input(start, stop)

        add_token(type, value)
      end

      # Returns the text between the specified start and stop position.
      #
      # @param [Fixnum] start
      # @param [Fixnum] stop
      # @return [String]
      def slice_input(start, stop)
        return @data.byteslice(start, stop - start)
      end

      # Yields a new token to the supplied block.
      #
      # @param [Symbol] type The token type.
      # @param [String] value The token value.
      #
      # @yieldparam [Symbol] type
      # @yieldparam [String|NilClass] value
      def add_token(type, value = nil)
        @block.call(type, value)
      end

      %%{
        getkey (data.getbyte(p) || 0);

        whitespace = [\n\t ];

        slash  = '/' @{ add_token(:T_SLASH) };
        lparen = '(' @{ add_token(:T_LPAREN) };
        rparen = ')' @{ add_token(:T_RPAREN) };
        comma  = ',' @{ add_token(:T_COMMA) };
        colon  = ':' @{ add_token(:T_COLON) };
        lbrack = '[' @{ add_token(:T_LBRACK) };
        rbrack = ']' @{ add_token(:T_RBRACK) };

        # Identifiers
        #
        # Identifiers are used for element names, namespaces, attribute
        # names, etc. Identifiers have to start with a letter, an underscore,
        # or a non-ASCII character; the wildcard '*' is also lexed as an
        # identifier.
        unicode          = any - ascii;
        unicode_or_ascii = (unicode | [a-zA-Z\-_0-9\.])*;

        identifier = '*' | (unicode | [a-zA-Z_]) unicode_or_ascii;

        action emit_identifier {
          emit(:T_IDENT, ts, te)
        }

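        # For example (the token values are a sketch of what emit_identifier
        # produces, not authoritative output):
        #
        #   'foo'     lexes to [:T_IDENT, 'foo']
        #   'foo-bar' lexes to [:T_IDENT, 'foo-bar'] ("-" is valid inside a name)
        #   '*'       lexes to [:T_IDENT, '*']
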
        # Numbers
        #
        # XPath expressions can contain both integers and floats. The W3
        # specification treats these both as the same type of number. Oga
        # instead lexes them separately so that we can convert the values to
        # the corresponding Ruby types (Fixnum and Float).
        integer = ('-' | '+')* digit+;
        float   = ('-' | '+')* digit+ ('.' digit+)*;

        action emit_integer {
          value = slice_input(ts, te).to_i

          add_token(:T_INT, value)
        }

        action emit_float {
          value = slice_input(ts, te).to_f

          add_token(:T_FLOAT, value)
        }

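        # For example (a sketch of the resulting tokens):
        #
        #   '10'   lexes to [:T_INT, 10]
        #   '10.5' lexes to [:T_FLOAT, 10.5]
        #
        # A bare run of digits matches both patterns at the same length; the
        # scanner then picks the integer alternative because it is listed
        # first in the main machine below.
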
        # Strings
        #
        # Strings can be single or double quoted. They are mainly used for
        # attribute values.
        dquote = '"';
        squote = "'";

        string_dquote = (dquote ^dquote* dquote);
        string_squote = (squote ^squote* squote);

        string = string_dquote | string_squote;

        action emit_string {
          emit(:T_STRING, ts + 1, te - 1)
        }

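        # The surrounding quotes are not part of the token value, as
        # emit_string skips one byte on each side. A sketch:
        #
        #   "'foo'"  lexes to [:T_STRING, 'foo']
        #   '"a[1]"' lexes to [:T_STRING, 'a[1]']
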
        # Full Axes
        #
        # XPath axes in their full syntax.
        axis_full = ('ancestor'
          | 'ancestor-or-self'
          | 'attribute'
          | 'child'
          | 'descendant'
          | 'descendant-or-self'
          | 'following'
          | 'following-sibling'
          | 'namespace'
          | 'parent'
          | 'preceding'
          | 'preceding-sibling'
          | 'self') '::';

        action emit_axis_full {
          emit(:T_AXIS, ts, te - 2)
        }

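        # The trailing "::" is stripped by the `te - 2` above. A sketch:
        #
        #   'child::foo' lexes to [:T_AXIS, 'child'], [:T_IDENT, 'foo']
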
        # Short Axes
        #
        # XPath axes in their abbreviated form. When lexing, these are mapped
        # to their full forms so that the parser doesn't have to take care of
        # this.
        axis_short = '@' | '//' | '..' | '.';

        action emit_axis_short {
          value = AXIS_MAPPING[slice_input(ts, te)]

          add_token(:T_AXIS, value)

          # Short axes that use node() as their default, implicit test. This
          # is added at the lexer level to make it easier to handle these
          # cases at the parser/evaluator level.
          if AXIS_EMIT_NODE.include?(value)
            add_token(:T_TYPE_TEST, 'node')

            if AXIS_EMIT_EXTRA_SLASH.include?(value) and te != eof
              add_token(:T_SLASH)
            end
          end
        }

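        # A sketch of the expansion for the descendant-or-self shorthand:
        #
        #   '//foo' lexes to [:T_AXIS, 'descendant-or-self'],
        #                    [:T_TYPE_TEST, 'node'],
        #                    [:T_SLASH, nil],
        #                    [:T_IDENT, 'foo']
        #
        # The extra T_SLASH is only emitted when "//" is not at the very end
        # of the input (the `te != eof` check above).
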
        # Operators
        #
        # Operators can only be used inside predicates due to "div" and "mod"
        # conflicting with the patterns used for matching identifiers (=
        # element names and the like).
        op_pipe = '|'  %{ add_token(:T_PIPE) };
        op_plus = '+'  %{ add_token(:T_ADD) };
        op_eq   = '='  %{ add_token(:T_EQ) };
        op_neq  = '!=' %{ add_token(:T_NEQ) };
        op_lt   = '<'  %{ add_token(:T_LT) };
        op_gt   = '>'  %{ add_token(:T_GT) };
        op_lte  = '<=' %{ add_token(:T_LTE) };
        op_gte  = '>=' %{ add_token(:T_GTE) };

        # These operators require whitespace around them in order to be lexed
        # as operators. This is due to "-" being allowed in node names and
        # "*" also being used as a wildcard (see the sketch after these
        # definitions).
        #
        # THINK: relying on whitespace is a rather fragile solution, even
        # though the W3C actually recommends this for the "-" operator.
        # Perhaps there's a better way of doing this.
        op_and = ' and ' %{ add_token(:T_AND) };
        op_or  = ' or '  %{ add_token(:T_OR) };
        op_div = ' div ' %{ add_token(:T_DIV) };
        op_mod = ' mod ' %{ add_token(:T_MOD) };
        op_mul = ' * '   %{ add_token(:T_MUL) };
        op_sub = ' - '   %{ add_token(:T_SUB) };

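        # A sketch of why the surrounding whitespace matters:
        #
        #   'a - b' lexes to [:T_IDENT, 'a'], [:T_SUB, nil], [:T_IDENT, 'b']
        #   'a-b'   lexes to [:T_IDENT, 'a-b'] (a single node name)
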
        operator = op_pipe
          | op_and
          | op_or
          | op_plus
          | op_div
          | op_mod
          | op_eq
          | op_neq
          | op_lt
          | op_gt
          | op_lte
          | op_gte
          | op_mul
          | op_sub
          ;

        # Node type tests
        #
        # While these look like functions they are actually node type tests.
        # For example, comment() matches all comment nodes.
        #
        # See http://www.w3.org/TR/xpath/#NT-NodeType for more information.
        type_test = (
          'comment' |
          'text' |
          'processing-instruction' |
          'node'
        ) '()';

        action emit_type_test {
          emit(:T_TYPE_TEST, ts, te - 2)
        }

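        # The "()" suffix is what distinguishes a type test from a plain
        # identifier; it is stripped by the `te - 2` above. A sketch:
        #
        #   'text()' lexes to [:T_TYPE_TEST, 'text']
        #   'text'   lexes to [:T_IDENT, 'text']
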
        # Variables
        #
        # XPath 1.0 allows the use of variables in expressions. Oddly enough,
        # you cannot assign variables in an expression, you can only refer to
        # them. This means that libraries themselves have to expose an
        # interface for setting variables.
        var = '$' identifier;

        action emit_variable {
          emit(:T_VAR, ts + 1, te)
        }

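        # The leading "$" is skipped via `ts + 1`. A sketch:
        #
        #   '$foo' lexes to [:T_VAR, 'foo']
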
        main := |*
          operator;

          whitespace | slash | lparen | rparen | comma | colon | lbrack | rbrack;

          type_test  => emit_type_test;
          var        => emit_variable;
          string     => emit_string;
          integer    => emit_integer;
          float      => emit_float;
          axis_full  => emit_axis_full;
          axis_short => emit_axis_short;
          identifier => emit_identifier;
        *|;
      }%%
    end # Lexer
  end # XPath
end # Oga