Lex input as a sequence of bytes.
Instead of lexing the input as a raw String or as a set of codepoints, it's treated as a sequence of bytes. This removes the need for String#[] (replaced by String#byteslice), which in turn reduces the amount of memory needed and speeds up lexing. Thanks to @headius and @apeiros for suggesting this and rubber ducking along!
This commit is contained in:
parent
70516b7447
commit
d9fa4b7c45
|
@ -71,7 +71,7 @@ module Oga
|
|||
instance_variable_set("@#{key}", value) if respond_to?(key)
|
||||
end
|
||||
|
||||
@data = data.unpack('U*')
|
||||
@data = data
|
||||
|
||||
reset
|
||||
end
|
||||
|
@ -90,7 +90,7 @@ module Oga
|
|||
@cs = self.class.lexer_start
|
||||
@act = 0
|
||||
@elements = []
|
||||
@eof = @data.length
|
||||
@eof = @data.bytesize
|
||||
@p = 0
|
||||
@pe = @eof
|
||||
|
||||
|
@ -191,7 +191,7 @@ module Oga
|
|||
# @return [String]
|
||||
#
|
||||
def text(start = @ts, stop = @te)
|
||||
return @data[start...stop].pack('U*')
|
||||
return @data.byteslice(start, stop - start)
|
||||
end
|
||||
|
||||
##
|
||||
|
@ -204,7 +204,6 @@ module Oga
|
|||
token = [type, value, @line]
|
||||
|
||||
@block.call(token)
|
||||
#@tokens << token
|
||||
end
|
||||
|
||||
##
|
||||
|
@ -263,7 +262,7 @@ module Oga
|
|||
%%{
|
||||
# Use instance variables for `ts` and friends.
|
||||
access @;
|
||||
getkey (@data[@p] || 0);
|
||||
getkey (@data.getbyte(@p) || 0);
|
||||
variable p @p;
|
||||
variable pe @pe;
|
||||
variable eof @eof;
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
# encoding: utf-8
|
||||
|
||||
require 'spec_helper'
|
||||
|
||||
describe Oga::XML::Lexer do
|
||||
context 'multibyte text nodes' do
|
||||
example 'lex a multibyte text node' do
|
||||
lex('쿠키').should == [[:T_TEXT, '쿠키', 1]]
|
||||
end
|
||||
end
|
||||
end
|
Loading…
Reference in New Issue