Lex input as a sequence of bytes.

Instead of lexing the input as a raw String or as a set of codepoints, it's treated as a sequence of bytes. This removes the need for String#[] (replaced by String#byteslice), which in turn reduces the amount of memory needed and speeds up lexing. Thanks to @headius and @apeiros for suggesting this and rubber ducking along!
2014-04-17 17:45:05 +02:00 · 2014-04-17 17:45:05 +02:00 · d9fa4b7c45
parent 70516b7447
commit d9fa4b7c45
2 changed files with 15 additions and 5 deletions
--- a/lib/oga/xml/lexer.rl
+++ b/lib/oga/xml/lexer.rl
@ -71,7 +71,7 @@ module Oga
          instance_variable_set("@#{key}", value) if respond_to?(key)
        end

-        @data = data.unpack('U*')
+        @data = data

        reset
      end
@ -90,7 +90,7 @@ module Oga
        @cs       = self.class.lexer_start
        @act      = 0
        @elements = []
-        @eof      = @data.length
+        @eof      = @data.bytesize
        @p        = 0
        @pe       = @eof

@ -191,7 +191,7 @@ module Oga
      # @return [String]
      #
      def text(start = @ts, stop = @te)
-        return @data[start...stop].pack('U*')
+        return @data.byteslice(start, stop - start)
      end

      ##
@ -204,7 +204,6 @@ module Oga
        token = [type, value, @line]

        @block.call(token)
-        #@tokens << token
      end

      ##
@ -263,7 +262,7 @@ module Oga
      %%{
        # Use instance variables for `ts` and friends.
        access @;
-        getkey (@data[@p] || 0);
+        getkey (@data.getbyte(@p) || 0);
        variable p @p;
        variable pe @pe;
        variable eof @eof;
--- a/spec/oga/xml/lexer/multibyte_spec.rb
+++ b/spec/oga/xml/lexer/multibyte_spec.rb
@ -0,0 +1,11 @@
+# encoding: utf-8
+
+require 'spec_helper'
+
+describe Oga::XML::Lexer do
+  context 'multibyte text nodes' do
+    example 'lex a multibyte text node' do
+      lex('쿠키').should == [[:T_TEXT, '쿠키', 1]]
+    end
+  end
+end