Cache rb_intern/symbol lookups in the lexer.

For JRuby this has little to no benefit, as it uses strings for method names.
However, both MRI and Rubinius perform a Symbol lookup whenever rb_intern()
is called. By doing this once for all callback names and caching the resulting
VALUE objects, the lexer timings can be reduced by about 25%. In the case of the
benchmark benchmark/xml/lexer/string_average_bench.rb, this means it runs in
around 500ms instead of 700ms.
This commit is contained in:
Yorick Peterse 2014-11-21 01:37:33 +01:00
parent a10fe855d7
commit 4fa88fcbde
3 changed files with 84 additions and 37 deletions

View File

@ -38,16 +38,15 @@ on `ts` and `te`) so the macro ignores this argument.
*/ */
void liboga_xml_lexer_callback( void liboga_xml_lexer_callback(
VALUE self, VALUE self,
const char *name, VALUE name,
rb_encoding *encoding, rb_encoding *encoding,
const char *ts, const char *ts,
const char *te const char *te
) )
{ {
VALUE value = rb_enc_str_new(ts, te - ts, encoding); VALUE value = rb_enc_str_new(ts, te - ts, encoding);
VALUE method = rb_intern(name);
rb_funcall(self, method, 1, value); rb_funcall(self, name, 1, value);
} }
/** /**
@ -57,11 +56,9 @@ void liboga_xml_lexer_callback(
* @example * @example
* liboga_xml_lexer_callback_simple(self, "on_cdata_start"); * liboga_xml_lexer_callback_simple(self, "on_cdata_start");
*/ */
void liboga_xml_lexer_callback_simple(VALUE self, const char *name) void liboga_xml_lexer_callback_simple(VALUE self, VALUE name)
{ {
VALUE method = rb_intern(name); rb_funcall(self, name, 0);
rb_funcall(self, method, 0);
} }
%% write data; %% write data;
@ -93,6 +90,31 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
int lines = state->lines; int lines = state->lines;
VALUE id_advance_line = rb_intern("advance_line");
VALUE id_on_attribute = rb_intern("on_attribute");
VALUE id_on_attribute_ns = rb_intern("on_attribute_ns");
VALUE id_on_cdata = rb_intern("on_cdata");
VALUE id_on_comment = rb_intern("on_comment");
VALUE id_on_doctype_end = rb_intern("on_doctype_end");
VALUE id_on_doctype_inline = rb_intern("on_doctype_inline");
VALUE id_on_doctype_name = rb_intern("on_doctype_name");
VALUE id_on_doctype_start = rb_intern("on_doctype_start");
VALUE id_on_doctype_type = rb_intern("on_doctype_type");
VALUE id_on_element_end = rb_intern("on_element_end");
VALUE id_on_element_name = rb_intern("on_element_name");
VALUE id_on_element_ns = rb_intern("on_element_ns");
VALUE id_on_element_open_end = rb_intern("on_element_open_end");
VALUE id_on_element_start = rb_intern("on_element_start");
VALUE id_on_proc_ins_end = rb_intern("on_proc_ins_end");
VALUE id_on_proc_ins_name = rb_intern("on_proc_ins_name");
VALUE id_on_proc_ins_start = rb_intern("on_proc_ins_start");
VALUE id_on_string_body = rb_intern("on_string_body");
VALUE id_on_string_dquote = rb_intern("on_string_dquote");
VALUE id_on_string_squote = rb_intern("on_string_squote");
VALUE id_on_text = rb_intern("on_text");
VALUE id_on_xml_decl_end = rb_intern("on_xml_decl_end");
VALUE id_on_xml_decl_start = rb_intern("on_xml_decl_start");
%% write exec; %% write exec;
state->lines = lines; state->lines = lines;

View File

@ -101,6 +101,31 @@ public class Lexer extends RubyObject
int pe = data.length; int pe = data.length;
int eof = data.length; int eof = data.length;
String id_advance_line = "advance_line";
String id_on_attribute = "on_attribute";
String id_on_attribute_ns = "on_attribute_ns";
String id_on_cdata = "on_cdata";
String id_on_comment = "on_comment";
String id_on_doctype_end = "on_doctype_end";
String id_on_doctype_inline = "on_doctype_inline";
String id_on_doctype_name = "on_doctype_name";
String id_on_doctype_start = "on_doctype_start";
String id_on_doctype_type = "on_doctype_type";
String id_on_element_end = "on_element_end";
String id_on_element_name = "on_element_name";
String id_on_element_ns = "on_element_ns";
String id_on_element_open_end = "on_element_open_end";
String id_on_element_start = "on_element_start";
String id_on_proc_ins_end = "on_proc_ins_end";
String id_on_proc_ins_name = "on_proc_ins_name";
String id_on_proc_ins_start = "on_proc_ins_start";
String id_on_string_body = "on_string_body";
String id_on_string_dquote = "on_string_dquote";
String id_on_string_squote = "on_string_squote";
String id_on_text = "on_text";
String id_on_xml_decl_end = "on_xml_decl_end";
String id_on_xml_decl_start = "on_xml_decl_start";
%% write exec; %% write exec;
this.lines = lines; this.lines = lines;

View File

@ -59,7 +59,7 @@
comment = comment_start (any* -- comment_end) comment_end; comment = comment_start (any* -- comment_end) comment_end;
action start_comment { action start_comment {
callback("on_comment", data, encoding, ts + 4, te - 3); callback(id_on_comment, data, encoding, ts + 4, te - 3);
} }
# CDATA # CDATA
@ -75,7 +75,7 @@
cdata = cdata_start (any* -- cdata_end) cdata_end; cdata = cdata_start (any* -- cdata_end) cdata_end;
action start_cdata { action start_cdata {
callback("on_cdata", data, encoding, ts + 9, te - 3); callback(id_on_cdata, data, encoding, ts + 9, te - 3);
} }
# Processing Instructions # Processing Instructions
@ -93,8 +93,8 @@
proc_ins_end = '?>'; proc_ins_end = '?>';
action start_proc_ins { action start_proc_ins {
callback_simple("on_proc_ins_start"); callback_simple(id_on_proc_ins_start);
callback("on_proc_ins_name", data, encoding, ts + 2, te); callback(id_on_proc_ins_name, data, encoding, ts + 2, te);
mark = te; mark = te;
@ -103,8 +103,8 @@
proc_ins_body := |* proc_ins_body := |*
proc_ins_end => { proc_ins_end => {
callback("on_text", data, encoding, mark, ts); callback(id_on_text, data, encoding, mark, ts);
callback_simple("on_proc_ins_end"); callback_simple(id_on_proc_ins_end);
mark = 0; mark = 0;
@ -124,7 +124,7 @@
squote = "'"; squote = "'";
action emit_string { action emit_string {
callback("on_string_body", data, encoding, ts, te); callback(id_on_string_body, data, encoding, ts, te);
if ( lines > 0 ) if ( lines > 0 )
{ {
@ -135,13 +135,13 @@
} }
action start_string_squote { action start_string_squote {
callback_simple("on_string_squote"); callback_simple(id_on_string_squote);
fcall string_squote; fcall string_squote;
} }
action start_string_dquote { action start_string_dquote {
callback_simple("on_string_dquote"); callback_simple(id_on_string_dquote);
fcall string_dquote; fcall string_dquote;
} }
@ -150,7 +150,7 @@
^squote* $count_newlines => emit_string; ^squote* $count_newlines => emit_string;
squote => { squote => {
callback_simple("on_string_squote"); callback_simple(id_on_string_squote);
fret; fret;
}; };
@ -160,7 +160,7 @@
^dquote* $count_newlines => emit_string; ^dquote* $count_newlines => emit_string;
dquote => { dquote => {
callback_simple("on_string_dquote"); callback_simple(id_on_string_dquote);
fret; fret;
}; };
@ -179,14 +179,14 @@
doctype_start = '<!DOCTYPE'i whitespace+; doctype_start = '<!DOCTYPE'i whitespace+;
action start_doctype { action start_doctype {
callback_simple("on_doctype_start"); callback_simple(id_on_doctype_start);
fnext doctype; fnext doctype;
} }
# Machine for processing inline rules of a doctype. # Machine for processing inline rules of a doctype.
doctype_inline := |* doctype_inline := |*
^']'* $count_newlines => { ^']'* $count_newlines => {
callback("on_doctype_inline", data, encoding, ts, te); callback(id_on_doctype_inline, data, encoding, ts, te);
if ( lines > 0 ) if ( lines > 0 )
{ {
@ -203,7 +203,7 @@
# and system IDs are treated as T_STRING tokens. # and system IDs are treated as T_STRING tokens.
doctype := |* doctype := |*
'PUBLIC' | 'SYSTEM' => { 'PUBLIC' | 'SYSTEM' => {
callback("on_doctype_type", data, encoding, ts, te); callback(id_on_doctype_type, data, encoding, ts, te);
}; };
# Starts a set of inline doctype rules. # Starts a set of inline doctype rules.
@ -218,11 +218,11 @@
whitespace; whitespace;
identifier => { identifier => {
callback("on_doctype_name", data, encoding, ts, te); callback(id_on_doctype_name, data, encoding, ts, te);
}; };
'>' => { '>' => {
callback_simple("on_doctype_end"); callback_simple(id_on_doctype_end);
fnext main; fnext main;
}; };
*|; *|;
@ -235,20 +235,20 @@
xml_decl_end = '?>'; xml_decl_end = '?>';
action start_xml_decl { action start_xml_decl {
callback_simple("on_xml_decl_start"); callback_simple(id_on_xml_decl_start);
fnext xml_decl; fnext xml_decl;
} }
# Machine that processes the contents of an XML declaration tag. # Machine that processes the contents of an XML declaration tag.
xml_decl := |* xml_decl := |*
xml_decl_end => { xml_decl_end => {
callback_simple("on_xml_decl_end"); callback_simple(id_on_xml_decl_end);
fnext main; fnext main;
}; };
# Attributes and their values (e.g. version="1.0"). # Attributes and their values (e.g. version="1.0").
identifier => { identifier => {
callback("on_attribute", data, encoding, ts, te); callback(id_on_attribute, data, encoding, ts, te);
}; };
squote => start_string_squote; squote => start_string_squote;
@ -270,23 +270,23 @@
element_end = '</' identifier (':' identifier)* '>'; element_end = '</' identifier (':' identifier)* '>';
action start_element { action start_element {
callback_simple("on_element_start"); callback_simple(id_on_element_start);
fhold; fhold;
fnext element_name; fnext element_name;
} }
action close_element { action close_element {
callback_simple("on_element_end"); callback_simple(id_on_element_end);
} }
# Machine used for lexing the name/namespace of an element. # Machine used for lexing the name/namespace of an element.
element_name := |* element_name := |*
identifier ':' => { identifier ':' => {
callback("on_element_ns", data, encoding, ts, te - 1); callback(id_on_element_ns, data, encoding, ts, te - 1);
}; };
identifier => { identifier => {
callback("on_element_name", data, encoding, ts, te); callback(id_on_element_name, data, encoding, ts, te);
fnext element_head; fnext element_head;
}; };
*|; *|;
@ -297,16 +297,16 @@
whitespace | '='; whitespace | '=';
newline => { newline => {
callback_simple("advance_line"); callback_simple(id_advance_line);
}; };
# Attribute names and namespaces. # Attribute names and namespaces.
identifier ':' => { identifier ':' => {
callback("on_attribute_ns", data, encoding, ts, te - 1); callback(id_on_attribute_ns, data, encoding, ts, te - 1);
}; };
identifier => { identifier => {
callback("on_attribute", data, encoding, ts, te); callback(id_on_attribute, data, encoding, ts, te);
}; };
# Attribute values. # Attribute values.
@ -315,13 +315,13 @@
# We're done with the open tag of the element. # We're done with the open tag of the element.
'>' => { '>' => {
callback_simple("on_element_open_end"); callback_simple(id_on_element_open_end);
fnext main; fnext main;
}; };
# Self closing tags. # Self closing tags.
'/>' => { '/>' => {
callback_simple("on_element_end"); callback_simple(id_on_element_end);
fnext main; fnext main;
}; };
*|; *|;
@ -350,7 +350,7 @@
text := |* text := |*
terminate_text | allowed_text => { terminate_text | allowed_text => {
callback("on_text", data, encoding, ts, te); callback(id_on_text, data, encoding, ts, te);
if ( lines > 0 ) if ( lines > 0 )
{ {
@ -364,7 +364,7 @@
# Text followed by a special tag, such as "foo<!--" # Text followed by a special tag, such as "foo<!--"
allowed_text %{ mark = p; } terminate_text => { allowed_text %{ mark = p; } terminate_text => {
callback("on_text", data, encoding, ts, mark); callback(id_on_text, data, encoding, ts, mark);
p = mark - 1; p = mark - 1;
mark = 0; mark = 0;