Count newlines of text nodes in native code.
Instead of relying on String#count for counting newlines in text nodes, Oga now does this in C/Java. String#count isn't exactly the fastest way of counting characters. Performance was measured using benchmark/xml/lexer/string_average_bench.rb. Before this patch the results were as following: MRI: 0.529s Rbx: 4.965s JRuby: 0.622s After this patch: MRI: 0.424s Rbx: 1.942s JRuby: 0.665s => numbers vary a bit, seem roughly the same as before The commands used for benchmarking: $ rake clean # to make sure that C exts aren't shared between MRI/Rbx $ rake generate $ rake fixtures $ ruby benchmark/xml/lexer/string_average_bench.rb The big difference for Rbx is probably due to the implementation of String#count not being super fast. Some changes were made (https://github.com/rubinius/rubinius/pull/3133) to the method, but this hasn't been released yet. JRuby seems to perform in a similar way, so either it was already optimizing things for me or I suck at writing well performing Java code. This fixes #51.
This commit is contained in:
parent
4469ffc5b1
commit
8db77c0a09
|
@ -22,6 +22,9 @@ on `ts` and `te`) so the macro ignores this argument.
|
||||||
#define oga_ivar_set(owner, name, value) \
|
#define oga_ivar_set(owner, name, value) \
|
||||||
rb_ivar_set(owner, rb_intern(name), value)
|
rb_ivar_set(owner, rb_intern(name), value)
|
||||||
|
|
||||||
|
#define advance_line(amount) \
|
||||||
|
rb_funcall(self, rb_intern("advance_line"), 1, INT2NUM(amount));
|
||||||
|
|
||||||
%%machine c_lexer;
|
%%machine c_lexer;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -84,8 +87,9 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
|
||||||
const char *te = 0;
|
const char *te = 0;
|
||||||
const char *mark = 0;
|
const char *mark = 0;
|
||||||
|
|
||||||
int act = NUM2INT(oga_ivar_get(self, "@act"));
|
int act = NUM2INT(oga_ivar_get(self, "@act"));
|
||||||
int cs = NUM2INT(oga_ivar_get(self, "@cs"));
|
int cs = NUM2INT(oga_ivar_get(self, "@cs"));
|
||||||
|
int lines = 0;
|
||||||
|
|
||||||
%% write exec;
|
%% write exec;
|
||||||
|
|
||||||
|
|
|
@ -90,12 +90,13 @@ public class Lexer extends RubyObject
|
||||||
|
|
||||||
byte[] data = rb_str.getBytes();
|
byte[] data = rb_str.getBytes();
|
||||||
|
|
||||||
int ts = 0;
|
int ts = 0;
|
||||||
int te = 0;
|
int te = 0;
|
||||||
int p = 0;
|
int p = 0;
|
||||||
int mark = 0;
|
int mark = 0;
|
||||||
int pe = data.length;
|
int lines = 0;
|
||||||
int eof = data.length;
|
int pe = data.length;
|
||||||
|
int eof = data.length;
|
||||||
|
|
||||||
%% write exec;
|
%% write exec;
|
||||||
|
|
||||||
|
@ -141,6 +142,17 @@ public class Lexer extends RubyObject
|
||||||
|
|
||||||
this.callMethod(context, name);
|
this.callMethod(context, name);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Advances the line number by `amount` lines.
|
||||||
|
*/
|
||||||
|
public void advance_line(int amount)
|
||||||
|
{
|
||||||
|
ThreadContext context = this.runtime.getCurrentContext();
|
||||||
|
RubyFixnum lines = this.runtime.newFixnum(amount);
|
||||||
|
|
||||||
|
this.callMethod(context, "advance_line", lines);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
%%{
|
%%{
|
||||||
|
|
|
@ -35,7 +35,12 @@
|
||||||
# stack.
|
# stack.
|
||||||
#
|
#
|
||||||
|
|
||||||
newline = '\n' | '\r\n';
|
newline = '\n' | '\r\n';
|
||||||
|
|
||||||
|
action count_newlines {
|
||||||
|
if ( fc == '\n' ) lines++;
|
||||||
|
}
|
||||||
|
|
||||||
whitespace = [ \t];
|
whitespace = [ \t];
|
||||||
ident_char = [a-zA-Z0-9\-_];
|
ident_char = [a-zA-Z0-9\-_];
|
||||||
identifier = ident_char+;
|
identifier = ident_char+;
|
||||||
|
@ -289,14 +294,19 @@
|
||||||
# long. Because of this "<!" is used instead of "<!--".
|
# long. Because of this "<!" is used instead of "<!--".
|
||||||
|
|
||||||
terminate_text = '</' | '<!' | '<?' | element_start;
|
terminate_text = '</' | '<!' | '<?' | element_start;
|
||||||
allowed_text = any* -- terminate_text;
|
allowed_text = (any* -- terminate_text) $count_newlines;
|
||||||
|
|
||||||
text := |*
|
text := |*
|
||||||
# Input such as just "</" or "<?". This rule takes precedence over the
|
terminate_text | allowed_text => {
|
||||||
# rules below, but only if those don't match.
|
|
||||||
terminate_text => {
|
|
||||||
callback("on_text", data, encoding, ts, te);
|
callback("on_text", data, encoding, ts, te);
|
||||||
|
|
||||||
|
if ( lines > 0 )
|
||||||
|
{
|
||||||
|
advance_line(lines);
|
||||||
|
|
||||||
|
lines = 0;
|
||||||
|
}
|
||||||
|
|
||||||
fnext main;
|
fnext main;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -307,12 +317,13 @@
|
||||||
p = mark - 1;
|
p = mark - 1;
|
||||||
mark = 0;
|
mark = 0;
|
||||||
|
|
||||||
fnext main;
|
if ( lines > 0 )
|
||||||
};
|
{
|
||||||
|
advance_line(lines);
|
||||||
|
|
||||||
|
lines = 0;
|
||||||
|
}
|
||||||
|
|
||||||
# Just regular text.
|
|
||||||
allowed_text => {
|
|
||||||
callback("on_text", data, encoding, ts, te);
|
|
||||||
fnext main;
|
fnext main;
|
||||||
};
|
};
|
||||||
*|;
|
*|;
|
||||||
|
|
|
@ -348,13 +348,7 @@ module Oga
|
||||||
# @param [String] value
|
# @param [String] value
|
||||||
#
|
#
|
||||||
def on_text(value)
|
def on_text(value)
|
||||||
unless value.empty?
|
add_token(:T_TEXT, value) unless value.empty?
|
||||||
add_token(:T_TEXT, value)
|
|
||||||
|
|
||||||
lines = value.count("\n")
|
|
||||||
|
|
||||||
advance_line(lines) if lines > 0
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
|
|
Loading…
Reference in New Issue