Count newlines of text nodes in native code.
Instead of relying on String#count for counting newlines in text nodes, Oga now
does this in C/Java. String#count isn't exactly the fastest way of counting
characters. Performance was measured using
benchmark/xml/lexer/string_average_bench.rb. Before this patch the results were
as follows:
    MRI:   0.529s
    Rbx:   4.965s
    JRuby: 0.622s
After this patch:
    MRI:   0.424s
    Rbx:   1.942s
    JRuby: 0.665s => numbers vary a bit, seem roughly the same as before
The commands used for benchmarking:
    $ rake clean # to make sure that C exts aren't shared between MRI/Rbx
    $ rake generate
    $ rake fixtures
    $ ruby benchmark/xml/lexer/string_average_bench.rb
The big difference for Rbx is probably due to the implementation of String#count
not being super fast. Some changes were made
(https://github.com/rubinius/rubinius/pull/3133) to the method, but these haven't
been released yet.
JRuby seems to perform in a similar way, so either it was already optimizing
things for me or I suck at writing well-performing Java code.
This fixes #51.
			
			
This commit is contained in:
		
							parent
							
								
									4469ffc5b1
								
							
						
					
					
						commit
						8db77c0a09
					
				|  | @ -22,6 +22,9 @@ on `ts` and `te`) so the macro ignores this argument. | ||||||
| #define oga_ivar_set(owner, name, value) \ | #define oga_ivar_set(owner, name, value) \ | ||||||
|     rb_ivar_set(owner, rb_intern(name), value) |     rb_ivar_set(owner, rb_intern(name), value) | ||||||
| 
 | 
 | ||||||
|  | #define advance_line(amount) \ | ||||||
|  |     rb_funcall(self, rb_intern("advance_line"), 1, INT2NUM(amount)); | ||||||
|  | 
 | ||||||
| %%machine c_lexer; | %%machine c_lexer; | ||||||
| 
 | 
 | ||||||
| /** | /** | ||||||
|  | @ -84,8 +87,9 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block) | ||||||
|     const char *te   = 0; |     const char *te   = 0; | ||||||
|     const char *mark = 0; |     const char *mark = 0; | ||||||
| 
 | 
 | ||||||
|     int act = NUM2INT(oga_ivar_get(self, "@act")); |     int act   = NUM2INT(oga_ivar_get(self, "@act")); | ||||||
|     int cs  = NUM2INT(oga_ivar_get(self, "@cs")); |     int cs    = NUM2INT(oga_ivar_get(self, "@cs")); | ||||||
|  |     int lines = 0; | ||||||
| 
 | 
 | ||||||
|     %% write exec; |     %% write exec; | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -90,12 +90,13 @@ public class Lexer extends RubyObject | ||||||
| 
 | 
 | ||||||
|         byte[] data = rb_str.getBytes(); |         byte[] data = rb_str.getBytes(); | ||||||
| 
 | 
 | ||||||
|         int ts   = 0; |         int ts    = 0; | ||||||
|         int te   = 0; |         int te    = 0; | ||||||
|         int p    = 0; |         int p     = 0; | ||||||
|         int mark = 0; |         int mark  = 0; | ||||||
|         int pe   = data.length; |         int lines = 0; | ||||||
|         int eof  = data.length; |         int pe    = data.length; | ||||||
|  |         int eof   = data.length; | ||||||
| 
 | 
 | ||||||
|         %% write exec; |         %% write exec; | ||||||
| 
 | 
 | ||||||
|  | @ -141,6 +142,17 @@ public class Lexer extends RubyObject | ||||||
| 
 | 
 | ||||||
|         this.callMethod(context, name); |         this.callMethod(context, name); | ||||||
|     } |     } | ||||||
|  | 
 | ||||||
|  |     /** | ||||||
|  |      * Advances the line number by `amount` lines. | ||||||
|  |      */ | ||||||
|  |     public void advance_line(int amount) | ||||||
|  |     { | ||||||
|  |         ThreadContext context = this.runtime.getCurrentContext(); | ||||||
|  |         RubyFixnum lines      = this.runtime.newFixnum(amount); | ||||||
|  | 
 | ||||||
|  |         this.callMethod(context, "advance_line", lines); | ||||||
|  |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| %%{ | %%{ | ||||||
|  |  | ||||||
|  | @ -35,7 +35,12 @@ | ||||||
|     # stack. |     # stack. | ||||||
|     # |     # | ||||||
| 
 | 
 | ||||||
|     newline    = '\n' | '\r\n'; |     newline = '\n' | '\r\n'; | ||||||
|  | 
 | ||||||
|  |     action count_newlines { | ||||||
|  |         if ( fc == '\n' ) lines++; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     whitespace = [ \t]; |     whitespace = [ \t]; | ||||||
|     ident_char = [a-zA-Z0-9\-_]; |     ident_char = [a-zA-Z0-9\-_]; | ||||||
|     identifier = ident_char+; |     identifier = ident_char+; | ||||||
|  | @ -289,14 +294,19 @@ | ||||||
|     # long. Because of this "<!" is used instead of "<!--". |     # long. Because of this "<!" is used instead of "<!--". | ||||||
| 
 | 
 | ||||||
|     terminate_text = '</' | '<!' | '<?' | element_start; |     terminate_text = '</' | '<!' | '<?' | element_start; | ||||||
|     allowed_text   = any* -- terminate_text; |     allowed_text   = (any* -- terminate_text) $count_newlines; | ||||||
| 
 | 
 | ||||||
|     text := |* |     text := |* | ||||||
|         # Input such as just "</" or "<?". This rule takes precedence over the |         terminate_text | allowed_text => { | ||||||
|         # rules below, but only if those don't match. |  | ||||||
|         terminate_text => { |  | ||||||
|             callback("on_text", data, encoding, ts, te); |             callback("on_text", data, encoding, ts, te); | ||||||
| 
 | 
 | ||||||
|  |             if ( lines > 0 ) | ||||||
|  |             { | ||||||
|  |                 advance_line(lines); | ||||||
|  | 
 | ||||||
|  |                 lines = 0; | ||||||
|  |             } | ||||||
|  | 
 | ||||||
|             fnext main; |             fnext main; | ||||||
|         }; |         }; | ||||||
| 
 | 
 | ||||||
|  | @ -307,12 +317,13 @@ | ||||||
|             p    = mark - 1; |             p    = mark - 1; | ||||||
|             mark = 0; |             mark = 0; | ||||||
| 
 | 
 | ||||||
|             fnext main; |             if ( lines > 0 ) | ||||||
|         }; |             { | ||||||
|  |                 advance_line(lines); | ||||||
|  | 
 | ||||||
|  |                 lines = 0; | ||||||
|  |             } | ||||||
| 
 | 
 | ||||||
|         # Just regular text. |  | ||||||
|         allowed_text => { |  | ||||||
|             callback("on_text", data, encoding, ts, te); |  | ||||||
|             fnext main; |             fnext main; | ||||||
|         }; |         }; | ||||||
|     *|; |     *|; | ||||||
|  |  | ||||||
|  | @ -348,13 +348,7 @@ module Oga | ||||||
|       # @param [String] value |       # @param [String] value | ||||||
|       # |       # | ||||||
|       def on_text(value) |       def on_text(value) | ||||||
|         unless value.empty? |         add_token(:T_TEXT, value) unless value.empty? | ||||||
|           add_token(:T_TEXT, value) |  | ||||||
| 
 |  | ||||||
|           lines = value.count("\n") |  | ||||||
| 
 |  | ||||||
|           advance_line(lines) if lines > 0 |  | ||||||
|         end |  | ||||||
|       end |       end | ||||||
| 
 | 
 | ||||||
|       ## |       ## | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue