From 8db77c0a09bf6c996dd2856a6dbe1ad076b1d30a Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Thu, 25 Sep 2014 22:49:11 +0200 Subject: [PATCH] Count newlines of text nodes in native code. Instead of relying on String#count for counting newlines in text nodes, Oga now does this in C/Java. String#count isn't exactly the fastest way of counting characters. Performance was measured using benchmark/xml/lexer/string_average_bench.rb. Before this patch the results were as following: MRI: 0.529s Rbx: 4.965s JRuby: 0.622s After this patch: MRI: 0.424s Rbx: 1.942s JRuby: 0.665s => numbers vary a bit, seem roughly the same as before The commands used for benchmarking: $ rake clean # to make sure that C exts aren't shared between MRI/Rbx $ rake generate $ rake fixtures $ ruby benchmark/xml/lexer/string_average_bench.rb The big difference for Rbx is probably due to the implementation of String#count not being super fast. Some changes were made (https://github.com/rubinius/rubinius/pull/3133) to the method, but this hasn't been released yet. JRuby seems to perform in a similar way, so either it was already optimizing things for me or I suck at writing well performing Java code. This fixes #51. --- ext/c/lexer.rl | 8 ++++++-- ext/java/org/liboga/xml/Lexer.rl | 24 ++++++++++++++++++------ ext/ragel/base_lexer.rl | 31 +++++++++++++++++++++---------- lib/oga/xml/lexer.rb | 8 +------- 4 files changed, 46 insertions(+), 25 deletions(-) diff --git a/ext/c/lexer.rl b/ext/c/lexer.rl index 6505a7e..43a135c 100644 --- a/ext/c/lexer.rl +++ b/ext/c/lexer.rl @@ -22,6 +22,9 @@ on `ts` and `te`) so the macro ignores this argument. #define oga_ivar_set(owner, name, value) \ rb_ivar_set(owner, rb_intern(name), value) +#define advance_line(amount) \ + rb_funcall(self, rb_intern("advance_line"), 1, INT2NUM(amount)); + %%machine c_lexer; /** @@ -84,8 +87,9 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block) const char *te = 0; const char *mark = 0; - int act = NUM2INT(oga_ivar_get(self, "@act")); - int cs = NUM2INT(oga_ivar_get(self, "@cs")); + int act = NUM2INT(oga_ivar_get(self, "@act")); + int cs = NUM2INT(oga_ivar_get(self, "@cs")); + int lines = 0; %% write exec; diff --git a/ext/java/org/liboga/xml/Lexer.rl b/ext/java/org/liboga/xml/Lexer.rl index 9449a34..a6d3327 100644 --- a/ext/java/org/liboga/xml/Lexer.rl +++ b/ext/java/org/liboga/xml/Lexer.rl @@ -90,12 +90,13 @@ public class Lexer extends RubyObject byte[] data = rb_str.getBytes(); - int ts = 0; - int te = 0; - int p = 0; - int mark = 0; - int pe = data.length; - int eof = data.length; + int ts = 0; + int te = 0; + int p = 0; + int mark = 0; + int lines = 0; + int pe = data.length; + int eof = data.length; %% write exec; @@ -141,6 +142,17 @@ public class Lexer extends RubyObject this.callMethod(context, name); } + + /** + * Advances the line number by `amount` lines. + */ + public void advance_line(int amount) + { + ThreadContext context = this.runtime.getCurrentContext(); + RubyFixnum lines = this.runtime.newFixnum(amount); + + this.callMethod(context, "advance_line", lines); + } } %%{ diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index 8f16976..28dd337 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -35,7 +35,12 @@ # stack. # - newline = '\n' | '\r\n'; + newline = '\n' | '\r\n'; + + action count_newlines { + if ( fc == '\n' ) lines++; + } + whitespace = [ \t]; ident_char = [a-zA-Z0-9\-_]; identifier = ident_char+; @@ -289,14 +294,19 @@ # long. Because of this " { + terminate_text | allowed_text => { callback("on_text", data, encoding, ts, te); + if ( lines > 0 ) + { + advance_line(lines); + + lines = 0; + } + fnext main; }; @@ -307,12 +317,13 @@ p = mark - 1; mark = 0; - fnext main; - }; + if ( lines > 0 ) + { + advance_line(lines); + + lines = 0; + } - # Just regular text. - allowed_text => { - callback("on_text", data, encoding, ts, te); fnext main; }; *|; diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb index 605e75a..be91028 100644 --- a/lib/oga/xml/lexer.rb +++ b/lib/oga/xml/lexer.rb @@ -348,13 +348,7 @@ module Oga # @param [String] value # def on_text(value) - unless value.empty? - add_token(:T_TEXT, value) - - lines = value.count("\n") - - advance_line(lines) if lines > 0 - end + add_token(:T_TEXT, value) unless value.empty? end ##