From 8dd8d7a51958e87f226ecfc9513b3f61aaa6aee3 Mon Sep 17 00:00:00 2001
From: Yorick Peterse <yorickpeterse@gmail.com>
Date: Sun, 1 Jun 2014 19:24:35 +0200
Subject: [PATCH] Basic working XPath lexer.

This doesn't lex everything of the XPath specification just yet and needs more
tests.
---
 lib/oga/xpath/lexer.rl       | 160 ++++++++++++++++++++++++++++++++++-
 spec/oga/xpath/lexer_spec.rb | 113 +++++++++++++++++++++++++
 spec/support/parsing.rb      |  10 +++
 3 files changed, 280 insertions(+), 3 deletions(-)
 create mode 100644 spec/oga/xpath/lexer_spec.rb

diff --git a/lib/oga/xpath/lexer.rl b/lib/oga/xpath/lexer.rl
index f497cd5..4565d2e 100644
--- a/lib/oga/xpath/lexer.rl
+++ b/lib/oga/xpath/lexer.rl
@@ -10,6 +10,13 @@ module Oga
 
       # % fix highlight
 
+      AXIS_MAPPING = {
+        '@'  => 'attribute',
+        '//' => 'descendant-or-self',
+        '..' => 'parent',
+        '.'  => 'self'
+      }
+
       ##
       # @param [String] data The data to lex.
       #
@@ -38,8 +45,8 @@ module Oga
       def lex
         tokens = []
 
-        advance do |token|
-          tokens << token
+        advance do |type, value|
+          tokens << [type, value]
         end
 
         reset
@@ -139,8 +146,155 @@ module Oga
       %%{
         getkey (data.getbyte(p) || 0);
 
+        whitespace = [\n\t ];
+
+        slash  = '/' @{ add_token(:T_SLASH) };
+        lparen = '(' @{ add_token(:T_LPAREN) };
+        rparen = ')' @{ add_token(:T_RPAREN) };
+        comma  = ',' @{ add_token(:T_COMMA) };
+        colon  = ':' @{ add_token(:T_COLON) };
+
+        # Identifiers
+        #
+        # Identifiers are used for element names, namespaces, attribute names,
+        # etc. Identifiers have to start with a letter.
+
+        identifier = [a-zA-Z]+ [a-zA-Z\-_0-9]*;
+
+        action emit_identifier {
+          emit(:T_IDENT, ts, te)
+        }
+
+        # Numbers
+        #
+        # XPath expressions can contain both integers and floats. The W3C
+        # specification treats both as the same type of number. Oga instead
+        # lexes them separately so that the values can be converted to the
+        # corresponding Ruby types (Integer and Float).
+
+        integer = digit+;
+        float   = digit+ '.' digit+; # exactly one fractional part, e.g. 1.5
+
+        action emit_integer {
+          value = slice_input(ts, te).to_i
+
+          add_token(:T_INT, value)
+        }
+
+        action emit_float {
+          value = slice_input(ts, te).to_f
+
+          add_token(:T_FLOAT, value)
+        }
+
+        # Strings
+        #
+        # Strings can be single or double quoted. They are mainly used for
+        # attribute values.
+        #
+        dquote = '"';
+        squote = "'";
+
+        string_dquote = (dquote ^dquote* dquote); # body may be empty
+        string_squote = (squote ^squote* squote); # body may be empty
+
+        string = string_dquote | string_squote;
+
+        action emit_string {
+          emit(:T_STRING, ts + 1, te - 1)
+        }
+
+        # Full Axes
+        #
+        # XPath axes in their full syntax.
+        #
+        axis_full = ('ancestor'
+          | 'ancestor-or-self'
+          | 'attribute'
+          | 'child'
+          | 'descendant'
+          | 'descendant-or-self'
+          | 'following'
+          | 'following-sibling'
+          | 'namespace'
+          | 'parent'
+          | 'preceding'
+          | 'preceding-sibling'
+          | 'self') '::';
+
+        action emit_axis_full {
+          emit(:T_AXIS, ts, te - 2)
+        }
+
+        # Short Axes
+        #
+        # XPath axes in their abbreviated form. While lexing, these are mapped
+        # to their full forms (see AXIS_MAPPING) so that the parser doesn't
+        # have to handle the abbreviations itself.
+        #
+        axis_short = '@' | '//' | '..' | '.';
+
+        action emit_axis_short {
+          value = AXIS_MAPPING[slice_input(ts, te)]
+
+          add_token(:T_AXIS, value)
+        }
+
+        # Operators
+        #
+        # Operators are only lexed inside predicates because "div" and "mod"
+        # would otherwise conflict with the pattern used for matching
+        # identifiers (element names and the like).
+
+        operator = '|'
+          | 'and'
+          | 'or'
+          | '+'
+          | '-'
+          | '*'
+          | 'div'
+          | 'mod'
+          | '='
+          | '!='
+          | '<'
+          | '>'
+          | '<='
+          | '>=';
+
+        action emit_operator {
+          emit(:T_OP, ts, te)
+        }
+
+        # Machine that handles the lexing of data inside an XPath predicate.
+        # When bumping into a "]" the lexer jumps back to the `main` machine.
+        predicate := |*
+          whitespace | slash | lparen | rparen | comma | colon;
+
+          string     => emit_string;
+          integer    => emit_integer;
+          float      => emit_float;
+          axis_full  => emit_axis_full;
+          axis_short => emit_axis_short;
+          operator   => emit_operator;
+          identifier => emit_identifier;
+
+          ']' => {
+            add_token(:T_RBRACK)
+            fnext main;
+          };
+        *|;
+
         main := |*
-          any => { };
+          whitespace | slash | lparen | rparen | comma | colon;
+
+          '[' => {
+            add_token(:T_LBRACK)
+            fnext predicate;
+          };
+
+          axis_full  => emit_axis_full;
+          axis_short => emit_axis_short;
+          identifier => emit_identifier;
         *|;
       }%%
     end # Lexer
diff --git a/spec/oga/xpath/lexer_spec.rb b/spec/oga/xpath/lexer_spec.rb
new file mode 100644
index 0000000..0debdeb
--- /dev/null
+++ b/spec/oga/xpath/lexer_spec.rb
@@ -0,0 +1,113 @@
+require 'spec_helper'
+
+describe Oga::XPath::Lexer do
+  example 'lex a simple expression' do
+    lex_xpath('/foo').should == [[:T_SLASH, nil], [:T_IDENT, 'foo']]
+  end
+
+  example 'lex a function call without arguments' do
+    lex_xpath('count()').should == [
+      [:T_IDENT, 'count'],
+      [:T_LPAREN, nil],
+      [:T_RPAREN, nil]
+    ]
+  end
+
+  example 'lex a function call with a single argument' do
+    lex_xpath('count(foo)').should == [
+      [:T_IDENT, 'count'],
+      [:T_LPAREN, nil],
+      [:T_IDENT, 'foo'],
+      [:T_RPAREN, nil]
+    ]
+  end
+
+  example 'lex a function call with two arguments' do
+    lex_xpath('count(foo, bar)').should == [
+      [:T_IDENT, 'count'],
+      [:T_LPAREN, nil],
+      [:T_IDENT, 'foo'],
+      [:T_COMMA, nil],
+      [:T_IDENT, 'bar'],
+      [:T_RPAREN, nil]
+    ]
+  end
+
+  example 'lex a simple predicate expression' do
+    lex_xpath('/foo[bar]').should == [
+      [:T_SLASH, nil],
+      [:T_IDENT, 'foo'],
+      [:T_LBRACK, nil],
+      [:T_IDENT, 'bar'],
+      [:T_RBRACK, nil]
+    ]
+  end
+
+  example 'lex a predicate that checks for equality' do
+    lex_xpath('/foo[@bar="baz"]').should == [
+      [:T_SLASH, nil],
+      [:T_IDENT, 'foo'],
+      [:T_LBRACK, nil],
+      [:T_AXIS, 'attribute'],
+      [:T_IDENT, 'bar'],
+      [:T_OP, '='],
+      [:T_STRING, 'baz'],
+      [:T_RBRACK, nil]
+    ]
+  end
+
+  example 'lex a predicate that user an integer' do
+    lex_xpath('/foo[1]').should == [
+      [:T_SLASH, nil],
+      [:T_IDENT, 'foo'],
+      [:T_LBRACK, nil],
+      [:T_INT, 1],
+      [:T_RBRACK, nil]
+    ]
+  end
+
+  example 'lex a predicate that uses a float' do
+    lex_xpath('/foo[1.5]').should == [
+      [:T_SLASH, nil],
+      [:T_IDENT, 'foo'],
+      [:T_LBRACK, nil],
+      [:T_FLOAT, 1.5],
+      [:T_RBRACK, nil]
+    ]
+  end
+
+  example 'lex a predicate using a function' do
+    lex_xpath('/foo[bar()]').should == [
+      [:T_SLASH, nil],
+      [:T_IDENT, 'foo'],
+      [:T_LBRACK, nil],
+      [:T_IDENT, 'bar'],
+      [:T_LPAREN, nil],
+      [:T_RPAREN, nil],
+      [:T_RBRACK, nil]
+    ]
+  end
+
+  example 'lex an axis using the full syntax form' do
+    lex_xpath('/parent::node()').should == [
+      [:T_SLASH, nil],
+      [:T_AXIS, 'parent'],
+      [:T_IDENT, 'node'],
+      [:T_LPAREN, nil],
+      [:T_RPAREN, nil]
+    ]
+  end
+
+  example 'lex an axis using the short syntax form' do
+    lex_xpath('/..').should == [[:T_SLASH, nil], [:T_AXIS, 'parent']]
+  end
+
+  example 'lex a node test using a namespace' do
+    lex_xpath('/foo:bar').should == [
+      [:T_SLASH, nil],
+      [:T_IDENT, 'foo'],
+      [:T_COLON, nil],
+      [:T_IDENT, 'bar']
+    ]
+  end
+end
diff --git a/spec/support/parsing.rb b/spec/support/parsing.rb
index 5486708..411fbcb 100644
--- a/spec/support/parsing.rb
+++ b/spec/support/parsing.rb
@@ -22,6 +22,16 @@ module Oga
       return Oga::XML::Lexer.new(input, options).lex
     end
 
+    ##
+    # Lexes an XPath expression using a new Oga::XPath::Lexer.
+    #
+    # @param [String] input The XPath expression to lex.
+    # @return [Array] The tokens, each a `[type, value]` pair.
+    #
+    def lex_xpath(input)
+      return Oga::XPath::Lexer.new(input).lex
+    end
+
     ##
     # Parses the given XML and returns an AST.
     #