Count newlines of text nodes in native code.

Instead of relying on String#count for counting newlines in text nodes, Oga now
does this in C/Java. String#count isn't exactly the fastest way of counting
characters. Performance was measured using
benchmark/xml/lexer/string_average_bench.rb. Before this patch the results were
as follows:

    MRI:   0.529s
    Rbx:   4.965s
    JRuby: 0.622s

After this patch:

    MRI:   0.424s
    Rbx:   1.942s
    JRuby: 0.665s => numbers vary a bit, seem roughly the same as before

The commands used for benchmarking:

    $ rake clean # to make sure that C exts aren't shared between MRI/Rbx
    $ rake generate
    $ rake fixtures
    $ ruby benchmark/xml/lexer/string_average_bench.rb

The big difference for Rbx is probably due to the implementation of String#count
not being super fast. Some changes were made
(https://github.com/rubinius/rubinius/pull/3133) to the method, but these haven't
been released yet.

JRuby seems to perform in a similar way, so either it was already optimizing
things for me or I suck at writing well-performing Java code.

This fixes #51.
This commit is contained in:
Yorick Peterse 2014-09-25 22:49:11 +02:00
parent 4469ffc5b1
commit 8db77c0a09
4 changed files with 46 additions and 25 deletions

View File

@ -22,6 +22,9 @@ on `ts` and `te`) so the macro ignores this argument.
#define oga_ivar_set(owner, name, value) \ #define oga_ivar_set(owner, name, value) \
rb_ivar_set(owner, rb_intern(name), value) rb_ivar_set(owner, rb_intern(name), value)
#define advance_line(amount) \
rb_funcall(self, rb_intern("advance_line"), 1, INT2NUM(amount));
%%machine c_lexer; %%machine c_lexer;
/** /**
@ -86,6 +89,7 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
int act = NUM2INT(oga_ivar_get(self, "@act")); int act = NUM2INT(oga_ivar_get(self, "@act"));
int cs = NUM2INT(oga_ivar_get(self, "@cs")); int cs = NUM2INT(oga_ivar_get(self, "@cs"));
int lines = 0;
%% write exec; %% write exec;

View File

@ -94,6 +94,7 @@ public class Lexer extends RubyObject
int te = 0; int te = 0;
int p = 0; int p = 0;
int mark = 0; int mark = 0;
int lines = 0;
int pe = data.length; int pe = data.length;
int eof = data.length; int eof = data.length;
@ -141,6 +142,17 @@ public class Lexer extends RubyObject
this.callMethod(context, name); this.callMethod(context, name);
} }
/**
* Advances the line number by `amount` lines.
*/
public void advance_line(int amount)
{
ThreadContext context = this.runtime.getCurrentContext();
RubyFixnum lines = this.runtime.newFixnum(amount);
this.callMethod(context, "advance_line", lines);
}
} }
%%{ %%{

View File

@ -36,6 +36,11 @@
# #
newline = '\n' | '\r\n'; newline = '\n' | '\r\n';
action count_newlines {
if ( fc == '\n' ) lines++;
}
whitespace = [ \t]; whitespace = [ \t];
ident_char = [a-zA-Z0-9\-_]; ident_char = [a-zA-Z0-9\-_];
identifier = ident_char+; identifier = ident_char+;
@ -289,14 +294,19 @@
# long. Because of this "<!" is used instead of "<!--". # long. Because of this "<!" is used instead of "<!--".
terminate_text = '</' | '<!' | '<?' | element_start; terminate_text = '</' | '<!' | '<?' | element_start;
allowed_text = any* -- terminate_text; allowed_text = (any* -- terminate_text) $count_newlines;
text := |* text := |*
# Input such as just "</" or "<?". This rule takes precedence over the terminate_text | allowed_text => {
# rules below, but only if those don't match.
terminate_text => {
callback("on_text", data, encoding, ts, te); callback("on_text", data, encoding, ts, te);
if ( lines > 0 )
{
advance_line(lines);
lines = 0;
}
fnext main; fnext main;
}; };
@ -307,12 +317,13 @@
p = mark - 1; p = mark - 1;
mark = 0; mark = 0;
fnext main; if ( lines > 0 )
}; {
advance_line(lines);
lines = 0;
}
# Just regular text.
allowed_text => {
callback("on_text", data, encoding, ts, te);
fnext main; fnext main;
}; };
*|; *|;

View File

@ -348,13 +348,7 @@ module Oga
# @param [String] value # @param [String] value
# #
def on_text(value) def on_text(value)
unless value.empty? add_token(:T_TEXT, value) unless value.empty?
add_token(:T_TEXT, value)
lines = value.count("\n")
advance_line(lines) if lines > 0
end
end end
## ##