Use index-based buffers for text nodes.

Instead of appending single characters to a String buffer, the lexer now uses a start and end position to determine the buffer's contents. This is a lot faster than constantly appending to a String.
2014-03-21 17:32:07 +01:00 · 2014-03-21 17:32:07 +01:00 · 9fa694ad4f
parent 2852afce9b
commit 9fa694ad4f
1 changed files with 51 additions and 14 deletions
--- a/lib/oga/lexer.rl
+++ b/lib/oga/lexer.rl
@ -81,7 +81,6 @@ module Oga
      @elements = []
      @string_buffer = ''
      @text_buffer   = ''
    end
    ##
@ -168,20 +167,46 @@ module Oga
      @tokens << token
    end
    ##
    # Enables text buffering starting at the given position.
    #
    # @param [Fixnum] position The start position of the buffer, set to `@te`
    #  by default.
    #
    def buffer_text(position = @te)
      # Record where the buffered text begins. A non-nil value here is also
      # what `buffer_text?` reports as "currently buffering".
      @text_start_position = position
    end
    ##
    # Returns `true` if we're currently buffering text.
    #
    # @return [TrueClass|FalseClass]
    #
    def buffer_text?
      # The start position is cleared (set to nil) once a buffer has been
      # emitted, so its truthiness doubles as the buffering flag. A start
      # position of 0 still counts: only nil and false are falsy in Ruby.
      @text_start_position ? true : false
    end
    ##
    # Emits the current text buffer if we have any. The current line number is
    # advanced based on the amount of newlines in the buffer.
    #
-    def emit_text_buffer
+    # @param [Fixnum] position The end position of the buffer, set to `@ts` by
-      return if @text_buffer.empty?
+    #  default.
    #
    def emit_text_buffer(position = @ts)
      # Nothing to emit unless buffering was started via `buffer_text`.
      return unless @text_start_position
-      add_token(:T_TEXT, @text_buffer)
+      content = text(@text_start_position, position)
-      lines = @text_buffer.count("\n")
+      unless content.empty?
        add_token(:T_TEXT, content)
-      advance_line(lines) if lines > 0
+        lines = content.count("\n")
-      @text_buffer = ''
+        advance_line(lines) if lines > 0
      end
      # Clear the start marker so `buffer_text?` returns false until
      # buffering is started again.
      @text_start_position = nil
    end
    ##
@ -230,10 +255,6 @@ module Oga
      dquote = '"';
      squote = "'";
      action buffer_text {
        # Appends the value returned by the text helper to the String held in
        # @text_buffer.
        # NOTE(review): the only uses of this action visible in this diff
        # (any => buffer_text) are on removed lines - confirm whether the
        # action is still needed.
        @text_buffer << text
      }
      action buffer_string {
        # Appends the value returned by the text helper to the String held in
        # @string_buffer.
        @string_buffer << text
      }
@ -317,6 +338,9 @@ module Oga
      action start_cdata {
        # Flush any text buffered up to this point before emitting the start
        # token.
        emit_text_buffer
        t(:T_CDATA_START)
        # Start a fresh buffer at the default position (@te), presumably so
        # the CDATA body is captured as text - confirm against the cdata
        # machine.
        buffer_text
        # Ragel fcall: push the current state and jump into the cdata machine.
        fcall cdata;
      }
@ -326,10 +350,11 @@ module Oga
        cdata_end => {
          emit_text_buffer
          t(:T_CDATA_END)
          fret;
        };
-        any => buffer_text;
+        any;
      *|;
      # Comments
@ -349,6 +374,9 @@ module Oga
      action start_comment {
        # Flush any text buffered up to this point before emitting the start
        # token.
        emit_text_buffer
        t(:T_COMMENT_START)
        # Start a fresh buffer at the default position (@te), presumably so
        # the comment body is captured as text - confirm against the comment
        # machine.
        buffer_text
        # Ragel fcall: push the current state and jump into the comment
        # machine.
        fcall comment;
      }
@ -358,10 +386,11 @@ module Oga
        comment_end => {
          emit_text_buffer
          t(:T_COMMENT_END)
          fret;
        };
-        any => buffer_text;
+        any;
      *|;
      # Elements
@ -453,7 +482,15 @@ module Oga
        # Note that this rule should be declared at the very bottom as it will
        # otherwise take precedence over the other rules.
-        any => { buffer_text_until_eof(eof) };
+        any => {
          # First character, start buffering (unless we already are buffering).
          buffer_text(@ts) unless buffer_text?
          # EOF, emit the text buffer.
          if @te == eof
            emit_text_buffer(@te)
          end
        };
      *|;
    }%%
  end # Lexer