From a2452b63714a7f7f11b99f03bc426e0bd2f7e7c0 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Sun, 23 Mar 2014 20:20:07 +0100 Subject: [PATCH] Use codepoints instead of chars in the lexer. Grand wizard overlord @whitequark recommended this as it will bypass the need for creating an individual String instance for every character (at least until one is actually needed). This becomes noticeable on large inputs (e.g. 100 MB of XML). Previously these would result in the kernel's OOM killer killing the process. Using codepoints, memory usage increases by a "mere" 1-1.5 GB. --- lib/oga/lexer.rl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/oga/lexer.rl b/lib/oga/lexer.rl index 31d2d02..2f4560d 100644 --- a/lib/oga/lexer.rl +++ b/lib/oga/lexer.rl @@ -95,7 +95,7 @@ module Oga # @return [Array] # def lex(data) - @data = data.chars.to_a + @data = data.codepoints lexer_start = self.class.lexer_start eof = data.length @@ -152,7 +152,7 @@ module Oga # @return [String] # def text(start = @ts, stop = @te) - return @data[start...stop].join('') + return @data[start...stop].pack('U*') end ## @@ -223,6 +223,7 @@ module Oga %%{ # Use instance variables for `ts` and friends. access @; + getkey (@data[p] || 0); newline = '\n' | '\r\n'; whitespace = [ \t];