Lex input as a sequence of bytes.
Instead of lexing the input as a raw String or as a set of codepoints, it is now treated as a sequence of bytes. This removes the need for String#[] (replaced by String#byteslice), which in turn reduces the amount of memory needed and speeds up lexing. Thanks to @headius and @apeiros for suggesting this and rubber ducking along!
This commit is contained in:
parent
70516b7447
commit
d9fa4b7c45
|
@ -71,7 +71,7 @@ module Oga
|
||||||
instance_variable_set("@#{key}", value) if respond_to?(key)
|
instance_variable_set("@#{key}", value) if respond_to?(key)
|
||||||
end
|
end
|
||||||
|
|
||||||
@data = data.unpack('U*')
|
@data = data
|
||||||
|
|
||||||
reset
|
reset
|
||||||
end
|
end
|
||||||
|
@ -90,7 +90,7 @@ module Oga
|
||||||
@cs = self.class.lexer_start
|
@cs = self.class.lexer_start
|
||||||
@act = 0
|
@act = 0
|
||||||
@elements = []
|
@elements = []
|
||||||
@eof = @data.length
|
@eof = @data.bytesize
|
||||||
@p = 0
|
@p = 0
|
||||||
@pe = @eof
|
@pe = @eof
|
||||||
|
|
||||||
|
@ -191,7 +191,7 @@ module Oga
|
||||||
# @return [String]
|
# @return [String]
|
||||||
#
|
#
|
||||||
def text(start = @ts, stop = @te)
|
def text(start = @ts, stop = @te)
|
||||||
return @data[start...stop].pack('U*')
|
return @data.byteslice(start, stop - start)
|
||||||
end
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
|
@ -204,7 +204,6 @@ module Oga
|
||||||
token = [type, value, @line]
|
token = [type, value, @line]
|
||||||
|
|
||||||
@block.call(token)
|
@block.call(token)
|
||||||
#@tokens << token
|
|
||||||
end
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
|
@ -263,7 +262,7 @@ module Oga
|
||||||
%%{
|
%%{
|
||||||
# Use instance variables for `ts` and friends.
|
# Use instance variables for `ts` and friends.
|
||||||
access @;
|
access @;
|
||||||
getkey (@data[@p] || 0);
|
getkey (@data.getbyte(@p) || 0);
|
||||||
variable p @p;
|
variable p @p;
|
||||||
variable pe @pe;
|
variable pe @pe;
|
||||||
variable eof @eof;
|
variable eof @eof;
|
||||||
|
|
|
@ -0,0 +1,11 @@
|
||||||
|
# encoding: utf-8
|
||||||
|
|
||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
describe Oga::XML::Lexer do
|
||||||
|
context 'multibyte text nodes' do
|
||||||
|
example 'lex a multibyte text node' do
|
||||||
|
lex('쿠키').should == [[:T_TEXT, '쿠키', 1]]
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
Loading…
Reference in New Issue