From d9fa4b7c4535894500ec2833284522405b30cbac Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Thu, 17 Apr 2014 17:45:05 +0200 Subject: [PATCH] Lex input as a sequence of bytes. Instead of lexing the input as a raw String or as a set of codepoints, it is treated as a sequence of bytes. This removes the need for String#[] (replaced by String#byteslice), which in turn reduces the amount of memory needed and speeds up lexing. Thanks to @headius and @apeiros for suggesting this and rubber ducking along! --- lib/oga/xml/lexer.rl | 9 ++++----- spec/oga/xml/lexer/multibyte_spec.rb | 11 +++++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) create mode 100644 spec/oga/xml/lexer/multibyte_spec.rb diff --git a/lib/oga/xml/lexer.rl b/lib/oga/xml/lexer.rl index abee926..8502fb1 100644 --- a/lib/oga/xml/lexer.rl +++ b/lib/oga/xml/lexer.rl @@ -71,7 +71,7 @@ module Oga instance_variable_set("@#{key}", value) if respond_to?(key) end - @data = data.unpack('U*') + @data = data reset end @@ -90,7 +90,7 @@ module Oga @cs = self.class.lexer_start @act = 0 @elements = [] - @eof = @data.length + @eof = @data.bytesize @p = 0 @pe = @eof @@ -191,7 +191,7 @@ module Oga # @return [String] # def text(start = @ts, stop = @te) - return @data[start...stop].pack('U*') + return @data.byteslice(start, stop - start) end ## @@ -204,7 +204,6 @@ module Oga token = [type, value, @line] @block.call(token) - #@tokens << token end ## @@ -263,7 +262,7 @@ module Oga %%{ # Use instance variables for `ts` and friends. 
access @; - getkey (@data[@p] || 0); + getkey (@data.getbyte(@p) || 0); variable p @p; variable pe @pe; variable eof @eof; diff --git a/spec/oga/xml/lexer/multibyte_spec.rb b/spec/oga/xml/lexer/multibyte_spec.rb new file mode 100644 index 0000000..d1ea493 --- /dev/null +++ b/spec/oga/xml/lexer/multibyte_spec.rb @@ -0,0 +1,11 @@ +# encoding: utf-8 + +require 'spec_helper' + +describe Oga::XML::Lexer do + context 'multibyte text nodes' do + example 'lex a multibyte text node' do + lex('쿠키').should == [[:T_TEXT, '쿠키', 1]] + end + end +end