Count newlines of text nodes in native code.

Instead of relying on String#count for counting newlines in text nodes, Oga now does this in C/Java. String#count isn't exactly the fastest way of counting characters. Performance was measured using benchmark/xml/lexer/string_average_bench.rb. Before this patch the results were as following: MRI: 0.529s Rbx: 4.965s JRuby: 0.622s After this patch: MRI: 0.424s Rbx: 1.942s JRuby: 0.665s => numbers vary a bit, seem roughly the same as before The commands used for benchmarking: $ rake clean # to make sure that C exts aren't shared between MRI/Rbx $ rake generate $ rake fixtures $ ruby benchmark/xml/lexer/string_average_bench.rb The big difference for Rbx is probably due to the implementation of String#count not being super fast. Some changes were made (https://github.com/rubinius/rubinius/pull/3133) to the method, but this hasn't been released yet. JRuby seems to perform in a similar way, so either it was already optimizing things for me or I suck at writing well performing Java code. This fixes #51.
2014-09-25 22:49:11 +02:00 · 2014-09-25 22:49:11 +02:00 · 8db77c0a09
parent 4469ffc5b1
commit 8db77c0a09
4 changed files with 46 additions and 25 deletions
--- a/ext/c/lexer.rl
+++ b/ext/c/lexer.rl
@ -22,6 +22,9 @@ on `ts` and `te`) so the macro ignores this argument.
 #define oga_ivar_set(owner, name, value) \
    rb_ivar_set(owner, rb_intern(name), value)
 #define advance_line(amount) \
    rb_funcall(self, rb_intern("advance_line"), 1, INT2NUM(amount));
 %%machine c_lexer;
 /**
@ -84,8 +87,9 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
    const char *te   = 0;
    const char *mark = 0;
-    int act = NUM2INT(oga_ivar_get(self, "@act"));
+    int act   = NUM2INT(oga_ivar_get(self, "@act"));
-    int cs  = NUM2INT(oga_ivar_get(self, "@cs"));
+    int cs    = NUM2INT(oga_ivar_get(self, "@cs"));
    int lines = 0;
    %% write exec;
--- a/ext/java/org/liboga/xml/Lexer.rl
+++ b/ext/java/org/liboga/xml/Lexer.rl
@ -90,12 +90,13 @@ public class Lexer extends RubyObject
        byte[] data = rb_str.getBytes();
-        int ts   = 0;
+        int ts    = 0;
-        int te   = 0;
+        int te    = 0;
-        int p    = 0;
+        int p     = 0;
-        int mark = 0;
+        int mark  = 0;
-        int pe   = data.length;
+        int lines = 0;
-        int eof  = data.length;
+        int pe    = data.length;
        int eof   = data.length;
        %% write exec;
@ -141,6 +142,17 @@ public class Lexer extends RubyObject
        this.callMethod(context, name);
    }
    /**
     * Advances the line number by `amount` lines.
     */
    public void advance_line(int amount)
    {
        ThreadContext context = this.runtime.getCurrentContext();
        RubyFixnum lines      = this.runtime.newFixnum(amount);
        this.callMethod(context, "advance_line", lines);
    }
 }
 %%{
--- a/ext/ragel/base_lexer.rl
+++ b/ext/ragel/base_lexer.rl
@ -35,7 +35,12 @@
    # stack.
    #
-    newline    = '\n' | '\r\n';
+    newline = '\n' | '\r\n';
    action count_newlines {
        if ( fc == '\n' ) lines++;
    }
    whitespace = [ \t];
    ident_char = [a-zA-Z0-9\-_];
    identifier = ident_char+;
@ -289,14 +294,19 @@
    # long. Because of this "<!" is used instead of "<!--".
    terminate_text = '</' | '<!' | '<?' | element_start;
-    allowed_text   = any* -- terminate_text;
+    allowed_text   = (any* -- terminate_text) $count_newlines;
    text := |*
-        # Input such as just "</" or "<?". This rule takes precedence over the
+        terminate_text | allowed_text => {
        # rules below, but only if those don't match.
        terminate_text => {
            callback("on_text", data, encoding, ts, te);
            if ( lines > 0 )
            {
                advance_line(lines);
                lines = 0;
            }
            fnext main;
        };
@ -307,12 +317,13 @@
            p    = mark - 1;
            mark = 0;
-            fnext main;
+            if ( lines > 0 )
-        };
+            {
                advance_line(lines);
                lines = 0;
            }
        # Just regular text.
        allowed_text => {
            callback("on_text", data, encoding, ts, te);
            fnext main;
        };
    *|;
--- a/lib/oga/xml/lexer.rb
+++ b/lib/oga/xml/lexer.rb
@ -348,13 +348,7 @@ module Oga
      # @param [String] value
      #
      def on_text(value)
-        unless value.empty?
+        add_token(:T_TEXT, value) unless value.empty?
          add_token(:T_TEXT, value)
          lines = value.count("\n")
          advance_line(lines) if lines > 0
        end
      end
      ##