Use codepoints instead of chars in the lexer.
Grand wizard overlord @whitequark recommended this as it bypasses the need to create an individual String instance for every character (at least until one is actually needed). This becomes noticeable on large inputs (e.g. 100 MB of XML). Previously these would result in the kernel's OOM killer terminating the process. Using codepoints, memory usage increases by a "mere" 1-1.5 GB.
This commit is contained in:
parent
cdf5f1d541
commit
a2452b6371
|
@ -95,7 +95,7 @@ module Oga
|
||||||
# @return [Array]
|
# @return [Array]
|
||||||
#
|
#
|
||||||
def lex(data)
|
def lex(data)
|
||||||
@data = data.chars.to_a
|
@data = data.codepoints
|
||||||
lexer_start = self.class.lexer_start
|
lexer_start = self.class.lexer_start
|
||||||
eof = data.length
|
eof = data.length
|
||||||
|
|
||||||
|
@ -152,7 +152,7 @@ module Oga
|
||||||
# @return [String]
|
# @return [String]
|
||||||
#
|
#
|
||||||
def text(start = @ts, stop = @te)
|
def text(start = @ts, stop = @te)
|
||||||
return @data[start...stop].join('')
|
return @data[start...stop].pack('U*')
|
||||||
end
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
|
@ -223,6 +223,7 @@ module Oga
|
||||||
%%{
|
%%{
|
||||||
# Use instance variables for `ts` and friends.
|
# Use instance variables for `ts` and friends.
|
||||||
access @;
|
access @;
|
||||||
|
getkey (@data[p] || 0);
|
||||||
|
|
||||||
newline = '\n' | '\r\n';
|
newline = '\n' | '\r\n';
|
||||||
whitespace = [ \t];
|
whitespace = [ \t];
|
||||||
|
|
Loading…
Reference in New Issue