From 629dcd3fe687358c78d16cdb0f6262a9436eaf22 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Mon, 26 May 2014 00:28:38 +0200 Subject: [PATCH] Support for IO inputs in the lexer. Using IO/StringIO objects one can parse large XML files without first having to read the entire file into memory. This can potentially save a lot of memory at the cost of a slightly slower runtime. For IO like instances the lexer will consume the input line by line. If a String is given it's consumed as a whole instead. A small side effect of reading the input line by line is that text such as "foo\nbar" will be lexed as two tokens instead of one. Fixes #19. --- ext/c/lexer.rl | 47 ++++++++++++++------- ext/java/org/liboga/xml/Lexer.rl | 27 +++++++++--- lib/oga/xml/lexer.rb | 72 +++++++++++++++++++++++++++----- spec/oga/xml/lexer/io_spec.rb | 19 +++++++++ spec/spec_helper.rb | 1 + 5 files changed, 133 insertions(+), 33 deletions(-) create mode 100644 spec/oga/xml/lexer/io_spec.rb diff --git a/ext/c/lexer.rl b/ext/c/lexer.rl index f239f1b..cada319 100644 --- a/ext/c/lexer.rl +++ b/ext/c/lexer.rl @@ -16,6 +16,12 @@ on `ts` and `te`) so the macro ignores this argument. #define callback_simple(name) \ liboga_xml_lexer_callback_simple(self, name); +#define oga_ivar_get(owner, name) \ + rb_ivar_get(owner, rb_intern(name)) + +#define oga_ivar_set(owner, name, value) \ + rb_ivar_set(owner, rb_intern(name), value) + %%machine c_lexer; /** @@ -58,20 +64,14 @@ void liboga_xml_lexer_callback_simple(VALUE self, const char *name) %% write data; /** - * Lexes the input String specified in the instance variable `@data`. Lexed - * values have the same encoding as the input value. This instance variable - * is set in the Ruby layer of the lexer. + * Lexes the String specified as the method argument. Token values have the + * same encoding as the input value. 
* - * The Ragel loop dispatches method calls back to Ruby land to make it easier - * to implement complex actions without having to fiddle around with C. This - * introduces a small performance overhead compared to a pure C implementation. - * However, this is worth the overhead due to it being much easier to maintain. + * This method keeps track of an internal state using the instance variables + * `@act` and `@cs`. */ -VALUE oga_xml_lexer_advance(VALUE self) +VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block) { - /* Pull the data in from Ruby land. */ - VALUE data_block = rb_funcall(self, rb_intern("read_data"), 0); - /* Make sure that all data passed back to Ruby has the proper encoding. */ rb_encoding *encoding = rb_enc_get(data_block); @@ -80,14 +80,28 @@ VALUE oga_xml_lexer_advance(VALUE self) const char *p = data_str_val; const char *pe = data_str_val + strlen(data_str_val); const char *eof = pe; - const char *ts, *te; + const char *ts = 0; + const char *te = 0; - int act = 0; - int cs = 0; + int act = NUM2INT(oga_ivar_get(self, "@act")); + int cs = NUM2INT(oga_ivar_get(self, "@cs")); - %% write init; %% write exec; + oga_ivar_set(self, "@act", INT2NUM(act)); + oga_ivar_set(self, "@cs", INT2NUM(cs)); + + return Qnil; +} + +/** + * Resets the internal state of the lexer. 
+ */ +VALUE oga_xml_lexer_reset(VALUE self) +{ + oga_ivar_set(self, "@act", INT2NUM(0)); + oga_ivar_set(self, "@cs", INT2NUM(c_lexer_start)); + return Qnil; } @@ -101,5 +115,6 @@ void Init_liboga_xml_lexer() VALUE mXML = rb_const_get(mOga, rb_intern("XML")); VALUE cLexer = rb_define_class_under(mXML, "Lexer", rb_cObject); - rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 0); + rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1); + rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0); } diff --git a/ext/java/org/liboga/xml/Lexer.rl b/ext/java/org/liboga/xml/Lexer.rl index d6f10f7..8300972 100644 --- a/ext/java/org/liboga/xml/Lexer.rl +++ b/ext/java/org/liboga/xml/Lexer.rl @@ -11,6 +11,7 @@ import org.jruby.RubyModule; import org.jruby.RubyClass; import org.jruby.RubyObject; import org.jruby.RubyString; +import org.jruby.RubyFixnum; import org.jruby.util.ByteList; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; @@ -36,6 +37,10 @@ public class Lexer extends RubyObject %% write data; + /* Used by Ragel to keep track of the current state. */ + int act; + int cs; + /** * Sets up the current class in the Ruby runtime. */ @@ -79,28 +84,35 @@ public class Lexer extends RubyObject * This method always returns nil. */ @JRubyMethod - public IRubyObject advance_native(ThreadContext context) + public IRubyObject advance_native(ThreadContext context, RubyString rb_str) { - // Pull the data in from Ruby land. - RubyString rb_str = (RubyString) this.callMethod(context, "read_data"); Encoding encoding = rb_str.getEncoding(); byte[] data = rb_str.getBytes(); - int act = 0; - int cs = 0; int ts = 0; int te = 0; int p = 0; int pe = data.length; int eof = data.length; - %% write init; %% write exec; return context.nil; } + /** + * Resets the internal state of the lexer. 
+ */ + @JRubyMethod + public IRubyObject reset_native(ThreadContext context) + { + this.act = 0; + this.cs = java_lexer_start; + + return context.nil; + } + /** * Calls back in to Ruby land passing the current token value along. * @@ -131,5 +143,8 @@ public class Lexer extends RubyObject } %%{ + variable act this.act; + variable cs this.cs; + include base_lexer "base_lexer.rl"; }%% diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb index 1d73132..e3a3ac3 100644 --- a/lib/oga/xml/lexer.rb +++ b/lib/oga/xml/lexer.rb @@ -7,6 +7,33 @@ module Oga # # lexer = Oga::XML::Lexer.new(:html => true) # + # This lexer can process both String and IO instances. IO instances are + # processed on a line by line basis. This can greatly reduce memory usage + # in exchange for a slightly slower runtime. + # + # ## Thread Safety + # + # Since this class keeps track of an internal state you can not use the + # same instance between multiple threads at the same time. For example, the + # following will not work reliably: + # + # # Don't do this! + # lexer = Oga::XML::Lexer.new('....') + # threads = [] + # + # 2.times do + # threads << Thread.new do + # lexer.advance do |*args| + # p args + # end + # end + # end + # + # threads.each(&:join) + # + # However, it is perfectly safe to use different instances per thread. + # There is no _global_ state used by this lexer. + # # @!attribute [r] html # @return [TrueClass|FalseClass] # @@ -39,13 +66,14 @@ module Oga ]) ## - # @param [String] data The data to lex. + # @param [String|IO] data The data to lex. This can either be a String or + # an IO instance. # # @param [Hash] options # # @option options [Symbol] :html When set to `true` the lexer will treat - # the input as HTML instead of SGML/XML. This makes it possible to lex - # HTML void elements such as ``. + # the input as HTML instead of SGML/XML. This makes it possible to lex + # HTML void elements such as ``. 
# def initialize(data, options = {}) @data = data @@ -62,15 +90,36 @@ module Oga def reset @line = 1 @elements = [] + + reset_native end ## - # Returns the next block of data to lex. + # Yields the data to lex to the supplied block. # # @return [String] + # @yieldparam [String] # def read_data - return @data + # We can't check for #each_line since String also defines that. Using + # String#each_line has no benefit over just lexing the String in one + # go. + if io_input? + @data.each_line do |line| + yield line + end + else + yield @data + end + end + + ## + # Returns `true` if the input is an IO like object, false otherwise. + # + # @return [TrueClass|FalseClass] + # + def io_input? + return @data.is_a?(IO) || @data.is_a?(StringIO) end ## @@ -79,9 +128,8 @@ module Oga # This method resets the internal state of the lexer after consuming the # input. # - # @param [String] data The string to consume. - # @return [Array] # @see #advance + # @return [Array] # def lex tokens = [] @@ -110,14 +158,16 @@ module Oga # # This method does *not* reset the internal state of the lexer. # - # - # @param [String] data The String to consume. - # @return [Array] + # @yieldparam [Symbol] type + # @yieldparam [String] value + # @yieldparam [Fixnum] line # def advance(&block) @block = block - advance_native + read_data do |chunk| + advance_native(chunk) + end ensure @block = nil end diff --git a/spec/oga/xml/lexer/io_spec.rb b/spec/oga/xml/lexer/io_spec.rb new file mode 100644 index 0000000..d3a1a97 --- /dev/null +++ b/spec/oga/xml/lexer/io_spec.rb @@ -0,0 +1,19 @@ +require 'spec_helper' + +describe Oga::XML::Lexer do + context 'IO as input' do + example 'lex a paragraph element with attributes' do + io = StringIO.new("

<p class='foo'>\nHello</p>

") + + lex(io).should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'p', 1], + [:T_ATTR, 'class', 1], + [:T_STRING, 'foo', 1], + [:T_TEXT, "\n", 1], + [:T_TEXT, 'Hello', 2], + [:T_ELEM_END, nil, 2] + ] + end + end +end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index ef1c8d4..8413653 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -1,4 +1,5 @@ require 'rspec' +require 'stringio' if ENV['COVERAGE'] require_relative 'support/simplecov'