From 70516b744733fabbd099e513dd6fda0d897fbd63 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Thu, 17 Apr 2014 00:39:41 +0200 Subject: [PATCH] Yield tokens in the lexer and parser. After some digging I found out that Racc has a method called `yyparse`. Using this method (and a custom callback method) you can `yield` tokens as a form of input. This makes it a lot easier to feed tokens as a stream from the lexer. Sadly the current performance of the lexer is still total garbage. Most of the memory usage also comes from using String#unpack, especially on large XML inputs (e.g. 100 MB of XML). It looks like the resulting memory usage is about 10x the input size. One option might be some kind of wrapper around String. This wrapper would have a sliding window of, say, 1024 bytes. When you create it the first 1024 bytes of the input would be unpacked. When seeking through the input this window would move forward. In theory this means that you'd end up with only 1024 Fixnum instances around at any given time instead of "a very big number". I have to test how efficient this is in practice. 
--- lib/oga/xml/lexer.rl | 47 +++++++++++++++++++++++++++++--------------- lib/oga/xml/parser.y | 16 ++++++++------- 2 files changed, 40 insertions(+), 23 deletions(-) diff --git a/lib/oga/xml/lexer.rl b/lib/oga/xml/lexer.rl index 67311fb..abee926 100644 --- a/lib/oga/xml/lexer.rl +++ b/lib/oga/xml/lexer.rl @@ -7,13 +7,18 @@ module Oga # To lex HTML input set the `:html` option to `true` when creating an # instance of the lexer: # - # lexer = Oga::Lexer.new(:html => true) + # lexer = Oga::XML::Lexer.new(:html => true) # # @!attribute [r] html # @return [TrueClass|FalseClass] # + # @!attribute [r] tokens + # @return [Array] + # class Lexer - %% write data; # % + %% write data; + + # % fix highlight attr_reader :html @@ -80,7 +85,6 @@ module Oga @line = 1 @ts = nil @te = nil - @tokens = [] @stack = [] @top = 0 @cs = self.class.lexer_start @@ -94,12 +98,7 @@ module Oga end ## - # Lexes the supplied String and returns an Array of tokens. Each token is - # an Array in the following format: - # - # [TYPE, VALUE] - # - # The type is a symbol, the value is either nil or a String. + # Gathers all the tokens for the input and returns them as an Array. # # This method resets the internal state of the lexer after consuming the # input. @@ -111,7 +110,7 @@ module Oga def lex tokens = [] - while token = advance + advance do |token| tokens << token end @@ -121,17 +120,32 @@ module Oga end ## - # Advances through the input and generates the corresponding tokens. + # Advances through the input and generates the corresponding tokens. Each + # token is yielded to the supplied block. + # + # Each token is an Array in the following format: + # + # [TYPE, VALUE] + # + # The type is a symbol, the value is either nil or a String. + # + # This method stores the supplied block in `@block` and resets it after + # the lexer loop has finished. # # This method does *not* reset the internal state of the lexer. # + # # @param [String] data The String to consume. 
# @return [Array] # - def advance - %% write exec; # % fix highlight + def advance(&block) + @block = block - return @tokens.shift + %% write exec; + + # % fix highlight + ensure + @block = nil end ## @@ -189,7 +203,8 @@ module Oga def add_token(type, value = nil) token = [type, value, @line] - @tokens << token + @block.call(token) + #@tokens << token end ## @@ -463,7 +478,7 @@ module Oga add_token(:T_ELEM_NS, ns) end - @elements << name + @elements << name if html add_token(:T_ELEM_NAME, name) diff --git a/lib/oga/xml/parser.y b/lib/oga/xml/parser.y index a208799..d5ce39c 100644 --- a/lib/oga/xml/parser.y +++ b/lib/oga/xml/parser.y @@ -168,16 +168,18 @@ end end ## - # Returns the next token from the lexer. + # Yields the next token from the lexer. # - # @return [Array] + # @yieldparam [Array] # - def next_token - type, value, line = @lexer.advance + def yield_next_token + @lexer.advance do |(type, value, line)| + @line = line if line - @line = line if line + yield [type, value] + end - return type ? [type, value] : [false, false] + yield [false, false] end ## @@ -231,7 +233,7 @@ Unexpected #{name} with value #{value.inspect} on line #{@line}: # @return [Oga::AST::Node] # def parse - ast = do_parse + ast = yyparse(self, :yield_next_token) reset