diff --git a/.gitignore b/.gitignore
index b1b1efe..4078dde 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,8 @@ pkg
 Gemfile.lock
 
 lib/oga/xml/parser.rb
+lib/oga/xpath/lexer.rb
+
 lib/liboga.*
 
 benchmark/fixtures/big.xml
diff --git a/Rakefile b/Rakefile
index ab553c4..1313afc 100644
--- a/Rakefile
+++ b/Rakefile
@@ -24,6 +24,7 @@ CLEAN.include(
   'coverage',
   'yardoc',
   PARSER_OUTPUT,
+  'lib/oga/xpath/lexer.rb',
   'benchmark/fixtures/big.xml',
   'profile/samples/**/*.txt',
   'lib/liboga.*',
diff --git a/lib/oga.rb b/lib/oga.rb
index 5258da2..44ae6e6 100644
--- a/lib/oga.rb
+++ b/lib/oga.rb
@@ -22,3 +22,5 @@ require_relative 'oga/xml/xml_declaration'
 require_relative 'oga/xml/doctype'
 
 require_relative 'oga/html/parser'
+
+require_relative 'oga/xpath/lexer'
diff --git a/lib/oga/xpath/lexer.rl b/lib/oga/xpath/lexer.rl
new file mode 100644
index 0000000..1e9e1d7
--- /dev/null
+++ b/lib/oga/xpath/lexer.rl
@@ -0,0 +1,153 @@
+%%machine xpath_lexer; # %
+
+module Oga
+  module XPath
+    ##
+    # Ragel lexer for lexing XPath queries.
+    #
+    class Lexer
+      %% write data;
+
+      # % fix highlight
+
+      ##
+      # @param [String] data The data to lex.
+      #
+      def initialize(data)
+        @data = data
+
+        reset
+      end
+
+      ##
+      # Resets the internal state of the lexer.
+      #
+      def reset
+        # No per-run state is kept yet, so there is nothing to reset.
+      end
+
+      ##
+      # Gathers all the tokens for the input and returns them as an Array.
+      #
+      # This method resets the internal state of the lexer after consuming the
+      # input.
+      #
+      # @see #advance
+      # @return [Array]
+      #
+      def lex
+        tokens = []
+
+        advance do |token|
+          tokens << token
+        end
+
+        reset
+
+        return tokens
+      end
+
+      ##
+      # Advances through the input and generates the corresponding tokens. Each
+      # token is yielded to the supplied block.
+      #
+      # Each token is an Array in the following format:
+      #
+      #     [TYPE, VALUE]
+      #
+      # The type is a symbol, the value is either nil or a String.
+      #
+      # This method stores the supplied block in `@block` and resets it after
+      # the lexer loop has finished.
+      #
+      # This method does *not* reset the internal state of the lexer.
+      #
+      # @yieldparam [Array] token The `[TYPE, VALUE]` pair described above.
+      #
+      def advance(&block)
+        @block = block
+
+        # The locals below are consumed by the code Ragel generates for
+        # `%% write exec`, so they have to keep these exact names.
+        data = @data # saves ivar lookups while lexing.
+        ts = nil
+        te = nil
+        stack = []
+        top = 0
+        cs = self.class.xpath_lexer_start
+        act = 0
+        eof = @data.bytesize
+        p = 0
+        pe = eof
+
+        # Cache the generated state tables in locals. They are fetched via
+        # `send`, presumably because Ragel emits the accessors as private.
+        #_xpath_lexer_eof_trans = self.class.send(:_xpath_lexer_eof_trans)
+        _xpath_lexer_from_state_actions = self.class.send(:_xpath_lexer_from_state_actions)
+        _xpath_lexer_index_offsets = self.class.send(:_xpath_lexer_index_offsets)
+        _xpath_lexer_indicies = self.class.send(:_xpath_lexer_indicies)
+        _xpath_lexer_key_spans = self.class.send(:_xpath_lexer_key_spans)
+        _xpath_lexer_to_state_actions = self.class.send(:_xpath_lexer_to_state_actions)
+        _xpath_lexer_trans_actions = self.class.send(:_xpath_lexer_trans_actions)
+        _xpath_lexer_trans_keys = self.class.send(:_xpath_lexer_trans_keys)
+        _xpath_lexer_trans_targs = self.class.send(:_xpath_lexer_trans_targs)
+
+        %% write exec;
+
+        # % fix highlight
+      ensure
+        @block = nil
+      end
+
+      private
+
+      ##
+      # Emits a token whose value is based on the supplied start/stop position.
+      #
+      # @param [Symbol] type The token type.
+      # @param [Fixnum] start
+      # @param [Fixnum] stop
+      #
+      # @see #text
+      # @see #add_token
+      #
+      def emit(type, start, stop)
+        value = text(start, stop)
+
+        add_token(type, value)
+      end
+
+      ##
+      # Returns the text of the current buffer based on the supplied start and
+      # stop position.
+      #
+      # @param [Fixnum] start
+      # @param [Fixnum] stop
+      # @return [String]
+      #
+      def text(start, stop)
+        return @data.byteslice(start, stop - start)
+      end
+
+      ##
+      # Builds a `[TYPE, VALUE]` token and passes it to the block in `@block`.
+      #
+      # @param [Symbol] type The token type.
+      # @param [String] value The token value.
+      #
+      def add_token(type, value = nil)
+        token = [type, value]
+
+        @block.call(token)
+      end
+
+      %%{
+        getkey (data.getbyte(p) || 0);
+
+        main := |*
+          any => { };
+        *|;
+      }%%
+    end # Lexer
+  end # XPath
+end # Oga
diff --git a/task/lexer.rake b/task/lexer.rake
index d8eeec7..e4bc98f 100644
--- a/task/lexer.rake
+++ b/task/lexer.rake
@@ -27,5 +27,6 @@ end
 desc 'Generates the lexers'
 multitask :lexer => [
   'ext/c/lexer.c',
-  'ext/java/org/liboga/xml/Lexer.java'
+  'ext/java/org/liboga/xml/Lexer.java',
+  'lib/oga/xpath/lexer.rb'
 ]