Support for IO inputs in the lexer.

Using IO/StringIO objects one can parse large XML files without first having to
read the entire file into memory. This can potentially save a lot of memory at
the cost of a slightly slower runtime.

For IO-like instances, the lexer will consume the input line by line. If a
String is given, it is consumed as a whole instead. A small side effect of
reading the input line by line is that text such as "foo\nbar" will be lexed as
two tokens instead of one.

Fixes #19.
This commit is contained in:
Yorick Peterse 2014-05-26 00:28:38 +02:00
parent 6b9d65923a
commit 629dcd3fe6
5 changed files with 133 additions and 33 deletions

View File

@ -16,6 +16,12 @@ on `ts` and `te`) so the macro ignores this argument.
#define callback_simple(name) \
liboga_xml_lexer_callback_simple(self, name);
#define oga_ivar_get(owner, name) \
rb_ivar_get(owner, rb_intern(name))
#define oga_ivar_set(owner, name, value) \
rb_ivar_set(owner, rb_intern(name), value)
%%machine c_lexer;
/**
@ -58,20 +64,14 @@ void liboga_xml_lexer_callback_simple(VALUE self, const char *name)
%% write data;
/**
* Lexes the input String specified in the instance variable `@data`. Lexed
* values have the same encoding as the input value. This instance variable
* is set in the Ruby layer of the lexer.
* Lexes the String specified as the method argument. Token values have the
* same encoding as the input value.
*
* The Ragel loop dispatches method calls back to Ruby land to make it easier
* to implement complex actions without having to fiddle around with C. This
* introduces a small performance overhead compared to a pure C implementation.
* However, this is worth the overhead due to it being much easier to maintain.
* This method keeps track of an internal state using the instance variables
* `@act` and `@cs`.
*/
VALUE oga_xml_lexer_advance(VALUE self)
VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
{
/* Pull the data in from Ruby land. */
VALUE data_block = rb_funcall(self, rb_intern("read_data"), 0);
/* Make sure that all data passed back to Ruby has the proper encoding. */
rb_encoding *encoding = rb_enc_get(data_block);
@ -80,14 +80,28 @@ VALUE oga_xml_lexer_advance(VALUE self)
const char *p = data_str_val;
const char *pe = data_str_val + strlen(data_str_val);
const char *eof = pe;
const char *ts, *te;
const char *ts = 0;
const char *te = 0;
int act = 0;
int cs = 0;
int act = NUM2INT(oga_ivar_get(self, "@act"));
int cs = NUM2INT(oga_ivar_get(self, "@cs"));
%% write init;
%% write exec;
oga_ivar_set(self, "@act", INT2NUM(act));
oga_ivar_set(self, "@cs", INT2NUM(cs));
return Qnil;
}
/**
 * Puts the lexer back into its initial state by storing 0 in `@act` and the
 * machine's start state (`c_lexer_start`) in `@cs`.
 *
 * This function always returns nil.
 */
VALUE oga_xml_lexer_reset(VALUE self)
{
    VALUE initial_act = INT2NUM(0);
    VALUE initial_cs  = INT2NUM(c_lexer_start);

    oga_ivar_set(self, "@act", initial_act);
    oga_ivar_set(self, "@cs", initial_cs);

    return Qnil;
}
@ -101,5 +115,6 @@ void Init_liboga_xml_lexer()
VALUE mXML = rb_const_get(mOga, rb_intern("XML"));
VALUE cLexer = rb_define_class_under(mXML, "Lexer", rb_cObject);
rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 0);
rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1);
rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0);
}

View File

@ -11,6 +11,7 @@ import org.jruby.RubyModule;
import org.jruby.RubyClass;
import org.jruby.RubyObject;
import org.jruby.RubyString;
import org.jruby.RubyFixnum;
import org.jruby.util.ByteList;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
@ -36,6 +37,10 @@ public class Lexer extends RubyObject
%% write data;
/* Used by Ragel to keep track of the current state. */
int act;
int cs;
/**
* Sets up the current class in the Ruby runtime.
*/
@ -79,28 +84,35 @@ public class Lexer extends RubyObject
* This method always returns nil.
*/
@JRubyMethod
public IRubyObject advance_native(ThreadContext context)
public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
{
// Pull the data in from Ruby land.
RubyString rb_str = (RubyString) this.callMethod(context, "read_data");
Encoding encoding = rb_str.getEncoding();
byte[] data = rb_str.getBytes();
int act = 0;
int cs = 0;
int ts = 0;
int te = 0;
int p = 0;
int pe = data.length;
int eof = data.length;
%% write init;
%% write exec;
return context.nil;
}
/**
 * Restores the Ragel bookkeeping fields: `act` goes back to 0 and `cs` goes
 * back to the machine's start state (`java_lexer_start`).
 *
 * This method always returns nil.
 */
@JRubyMethod
public IRubyObject reset_native(ThreadContext context)
{
    act = 0;
    cs  = java_lexer_start;

    return context.nil;
}
/**
* Calls back in to Ruby land passing the current token value along.
*
@ -131,5 +143,8 @@ public class Lexer extends RubyObject
}
%%{
variable act this.act;
variable cs this.cs;
include base_lexer "base_lexer.rl";
}%%

View File

@ -7,6 +7,33 @@ module Oga
#
# lexer = Oga::XML::Lexer.new(:html => true)
#
# This lexer can process both String and IO instances. IO instances are
# processed on a line by line basis. This can greatly reduce memory usage
# in exchange for a slightly slower runtime.
#
# ## Thread Safety
#
# Since this class keeps track of an internal state you cannot share the
# same instance between multiple threads at the same time. For example, the
# following will not work reliably:
#
# # Don't do this!
# lexer = Oga::XML::Lexer.new('....')
# threads = []
#
# 2.times do
# threads << Thread.new do
# lexer.advance do |*args|
# p args
# end
# end
# end
#
# threads.each(&:join)
#
# However, it is perfectly safe to use different instances per thread.
# There is no _global_ state used by this lexer.
#
# @!attribute [r] html
# @return [TrueClass|FalseClass]
#
@ -39,7 +66,8 @@ module Oga
])
##
# @param [String] data The data to lex.
# @param [String|IO] data The data to lex. This can either be a String or
# an IO instance.
#
# @param [Hash] options
#
@ -62,15 +90,36 @@ module Oga
def reset
  # Back to the first line with no tracked elements.
  @line, @elements = 1, []

  # Also reset the state kept by the native (C/Java) part of the lexer; its
  # return value is the return value of this method.
  reset_native
end
##
# Returns the next block of data to lex.
# Yields the data to lex to the supplied block.
#
# @return [String]
# @yieldparam [String]
#
def read_data
return @data
# We can't check for #each_line since String also defines that. Using
# String#each_line has no benefit over just lexing the String in one
# go.
if io_input?
@data.each_line do |line|
yield line
end
else
yield @data
end
end
##
# Returns `true` if the input is an IO like object, `false` otherwise.
#
# Any object that is not a String and responds to `#each_line` is treated
# as IO like. This covers IO and StringIO as well as IO like objects that
# do not inherit from IO (Tempfile, for example). Strings are excluded
# explicitly because String also defines `#each_line`.
#
# @return [TrueClass|FalseClass]
#
def io_input?
  return !@data.is_a?(String) && @data.respond_to?(:each_line)
end
##
@ -79,9 +128,8 @@ module Oga
# This method resets the internal state of the lexer after consuming the
# input.
#
# @param [String] data The string to consume.
# @return [Array]
# @see #advance
# @return [Array]
#
def lex
tokens = []
@ -110,14 +158,16 @@ module Oga
#
# This method does *not* reset the internal state of the lexer.
#
#
# @param [String] data The String to consume.
# @return [Array]
# @yieldparam [Symbol] type
# @yieldparam [String] value
# @yieldparam [Fixnum] line
#
def advance(&block)
@block = block
advance_native
read_data do |chunk|
advance_native(chunk)
end
ensure
@block = nil
end

View File

@ -0,0 +1,19 @@
require 'spec_helper'
describe Oga::XML::Lexer do
context 'IO as input' do
example 'lex a paragraph element with attributes' do
io = StringIO.new("<p class='foo'>\nHello</p>")
lex(io).should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'p', 1],
[:T_ATTR, 'class', 1],
[:T_STRING, 'foo', 1],
[:T_TEXT, "\n", 1],
[:T_TEXT, 'Hello', 2],
[:T_ELEM_END, nil, 2]
]
end
end
end

View File

@ -1,4 +1,5 @@
require 'rspec'
require 'stringio'
if ENV['COVERAGE']
require_relative 'support/simplecov'