Lex input as a sequence of bytes.

Instead of lexing the input as a raw String or as a set of codepoints, it is
treated as a sequence of bytes. This removes the need for String#[] (replaced
by String#byteslice), which in turn reduces the amount of memory needed and
speeds up lexing.

Thanks to @headius and @apeiros for suggesting this and rubber ducking along!
This commit is contained in:
Yorick Peterse 2014-04-17 17:45:05 +02:00
parent 70516b7447
commit d9fa4b7c45
2 changed files with 15 additions and 5 deletions

View File

@ -71,7 +71,7 @@ module Oga
instance_variable_set("@#{key}", value) if respond_to?(key) instance_variable_set("@#{key}", value) if respond_to?(key)
end end
@data = data.unpack('U*') @data = data
reset reset
end end
@ -90,7 +90,7 @@ module Oga
@cs = self.class.lexer_start @cs = self.class.lexer_start
@act = 0 @act = 0
@elements = [] @elements = []
@eof = @data.length @eof = @data.bytesize
@p = 0 @p = 0
@pe = @eof @pe = @eof
@ -191,7 +191,7 @@ module Oga
# @return [String] # @return [String]
# #
def text(start = @ts, stop = @te) def text(start = @ts, stop = @te)
return @data[start...stop].pack('U*') return @data.byteslice(start, stop - start)
end end
## ##
@ -204,7 +204,6 @@ module Oga
token = [type, value, @line] token = [type, value, @line]
@block.call(token) @block.call(token)
#@tokens << token
end end
## ##
@ -263,7 +262,7 @@ module Oga
%%{ %%{
# Use instance variables for `ts` and friends. # Use instance variables for `ts` and friends.
access @; access @;
getkey (@data[@p] || 0); getkey (@data.getbyte(@p) || 0);
variable p @p; variable p @p;
variable pe @pe; variable pe @pe;
variable eof @eof; variable eof @eof;

View File

@ -0,0 +1,11 @@
# encoding: utf-8
require 'spec_helper'
# Checks that the lexer handles text made of multibyte characters: the text
# must come back as one intact text token rather than being split or mangled.
describe Oga::XML::Lexer do
context 'multibyte text nodes' do
example 'lex a multibyte text node' do
# The expected result is a single token whose value equals the input string
# and whose last element is 1.
# NOTE(review): `lex` is not defined in this file; presumably a helper made
# available through spec_helper, with the trailing 1 being the line number —
# confirm.
lex('쿠키').should == [[:T_TEXT, '쿠키', 1]]
end
end
end