From a2452b63714a7f7f11b99f03bc426e0bd2f7e7c0 Mon Sep 17 00:00:00 2001
From: Yorick Peterse <yorickpeterse@gmail.com>
Date: Sun, 23 Mar 2014 20:20:07 +0100
Subject: [PATCH] Use codepoints instead of chars in the lexer.

Grand wizard overlord @whitequark recommended this as it bypasses the need to
create an individual String instance for every character (at least not until
needed). This becomes noticeable on large inputs (e.g. 100 MB of XML).
Previously these would result in the kernel OOM killing the process. Using
codepoints, memory usage increases by a "mere" 1-1.5 GB.
---
 lib/oga/lexer.rl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/oga/lexer.rl b/lib/oga/lexer.rl
index 31d2d02..2f4560d 100644
--- a/lib/oga/lexer.rl
+++ b/lib/oga/lexer.rl
@@ -95,7 +95,7 @@ module Oga
     # @return [Array]
     #
     def lex(data)
-      @data       = data.chars.to_a
+      @data       = data.codepoints
       lexer_start = self.class.lexer_start
       eof         = data.length
 
@@ -152,7 +152,7 @@ module Oga
     # @return [String]
     #
     def text(start = @ts, stop = @te)
-      return @data[start...stop].join('')
+      return @data[start...stop].pack('U*')
     end
 
     ##
@@ -223,6 +223,7 @@ module Oga
     %%{
       # Use instance variables for `ts` and friends.
       access @;
+      getkey (@data[p] || 0);
 
       newline    = '\n' | '\r\n';
       whitespace = [ \t];