Count newlines of text nodes in native code.
Instead of relying on String#count for counting newlines in text nodes, Oga now does this in C/Java. String#count isn't exactly the fastest way of counting characters. Performance was measured using benchmark/xml/lexer/string_average_bench.rb. Before this patch the results were as following: MRI: 0.529s Rbx: 4.965s JRuby: 0.622s After this patch: MRI: 0.424s Rbx: 1.942s JRuby: 0.665s => numbers vary a bit, seem roughly the same as before The commands used for benchmarking: $ rake clean # to make sure that C exts aren't shared between MRI/Rbx $ rake generate $ rake fixtures $ ruby benchmark/xml/lexer/string_average_bench.rb The big difference for Rbx is probably due to the implementation of String#count not being super fast. Some changes were made (https://github.com/rubinius/rubinius/pull/3133) to the method, but this hasn't been released yet. JRuby seems to perform in a similar way, so either it was already optimizing things for me or I suck at writing well performing Java code. This fixes #51.
This commit is contained in:
parent
4469ffc5b1
commit
8db77c0a09
|
@ -22,6 +22,9 @@ on `ts` and `te`) so the macro ignores this argument.
|
|||
#define oga_ivar_set(owner, name, value) \
|
||||
rb_ivar_set(owner, rb_intern(name), value)
|
||||
|
||||
#define advance_line(amount) \
|
||||
rb_funcall(self, rb_intern("advance_line"), 1, INT2NUM(amount));
|
||||
|
||||
%%machine c_lexer;
|
||||
|
||||
/**
|
||||
|
@ -84,8 +87,9 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
|
|||
const char *te = 0;
|
||||
const char *mark = 0;
|
||||
|
||||
int act = NUM2INT(oga_ivar_get(self, "@act"));
|
||||
int cs = NUM2INT(oga_ivar_get(self, "@cs"));
|
||||
int act = NUM2INT(oga_ivar_get(self, "@act"));
|
||||
int cs = NUM2INT(oga_ivar_get(self, "@cs"));
|
||||
int lines = 0;
|
||||
|
||||
%% write exec;
|
||||
|
||||
|
|
|
@ -90,12 +90,13 @@ public class Lexer extends RubyObject
|
|||
|
||||
byte[] data = rb_str.getBytes();
|
||||
|
||||
int ts = 0;
|
||||
int te = 0;
|
||||
int p = 0;
|
||||
int mark = 0;
|
||||
int pe = data.length;
|
||||
int eof = data.length;
|
||||
int ts = 0;
|
||||
int te = 0;
|
||||
int p = 0;
|
||||
int mark = 0;
|
||||
int lines = 0;
|
||||
int pe = data.length;
|
||||
int eof = data.length;
|
||||
|
||||
%% write exec;
|
||||
|
||||
|
@ -141,6 +142,17 @@ public class Lexer extends RubyObject
|
|||
|
||||
this.callMethod(context, name);
|
||||
}
|
||||
|
||||
/**
|
||||
* Advances the line number by `amount` lines.
|
||||
*/
|
||||
public void advance_line(int amount)
|
||||
{
|
||||
ThreadContext context = this.runtime.getCurrentContext();
|
||||
RubyFixnum lines = this.runtime.newFixnum(amount);
|
||||
|
||||
this.callMethod(context, "advance_line", lines);
|
||||
}
|
||||
}
|
||||
|
||||
%%{
|
||||
|
|
|
@ -35,7 +35,12 @@
|
|||
# stack.
|
||||
#
|
||||
|
||||
newline = '\n' | '\r\n';
|
||||
newline = '\n' | '\r\n';
|
||||
|
||||
action count_newlines {
|
||||
if ( fc == '\n' ) lines++;
|
||||
}
|
||||
|
||||
whitespace = [ \t];
|
||||
ident_char = [a-zA-Z0-9\-_];
|
||||
identifier = ident_char+;
|
||||
|
@ -289,14 +294,19 @@
|
|||
# long. Because of this "<!" is used instead of "<!--".
|
||||
|
||||
terminate_text = '</' | '<!' | '<?' | element_start;
|
||||
allowed_text = any* -- terminate_text;
|
||||
allowed_text = (any* -- terminate_text) $count_newlines;
|
||||
|
||||
text := |*
|
||||
# Input such as just "</" or "<?". This rule takes precedence over the
|
||||
# rules below, but only if those don't match.
|
||||
terminate_text => {
|
||||
terminate_text | allowed_text => {
|
||||
callback("on_text", data, encoding, ts, te);
|
||||
|
||||
if ( lines > 0 )
|
||||
{
|
||||
advance_line(lines);
|
||||
|
||||
lines = 0;
|
||||
}
|
||||
|
||||
fnext main;
|
||||
};
|
||||
|
||||
|
@ -307,12 +317,13 @@
|
|||
p = mark - 1;
|
||||
mark = 0;
|
||||
|
||||
fnext main;
|
||||
};
|
||||
if ( lines > 0 )
|
||||
{
|
||||
advance_line(lines);
|
||||
|
||||
lines = 0;
|
||||
}
|
||||
|
||||
# Just regular text.
|
||||
allowed_text => {
|
||||
callback("on_text", data, encoding, ts, te);
|
||||
fnext main;
|
||||
};
|
||||
*|;
|
||||
|
|
|
@ -348,13 +348,7 @@ module Oga
|
|||
# @param [String] value
|
||||
#
|
||||
def on_text(value)
|
||||
unless value.empty?
|
||||
add_token(:T_TEXT, value)
|
||||
|
||||
lines = value.count("\n")
|
||||
|
||||
advance_line(lines) if lines > 0
|
||||
end
|
||||
add_token(:T_TEXT, value) unless value.empty?
|
||||
end
|
||||
|
||||
##
|
||||
|
|
Loading…
Reference in New Issue