Support for IO inputs in the lexer.
Using IO/StringIO objects one can parse large XML files without first having to read the entire file into memory. This can potentially save a lot of memory at the cost of a slightly slower runtime. For IO-like instances the lexer will consume the input line by line. If a String is given it's consumed as a whole instead. A small side effect of reading the input line by line is that text such as "foo\nbar" will be lexed as two tokens instead of one. Fixes #19.
This commit is contained in:
parent
6b9d65923a
commit
629dcd3fe6
|
@ -16,6 +16,12 @@ on `ts` and `te`) so the macro ignores this argument.
|
|||
#define callback_simple(name) \
|
||||
liboga_xml_lexer_callback_simple(self, name);
|
||||
|
||||
#define oga_ivar_get(owner, name) \
|
||||
rb_ivar_get(owner, rb_intern(name))
|
||||
|
||||
#define oga_ivar_set(owner, name, value) \
|
||||
rb_ivar_set(owner, rb_intern(name), value)
|
||||
|
||||
%%machine c_lexer;
|
||||
|
||||
/**
|
||||
|
@ -58,20 +64,14 @@ void liboga_xml_lexer_callback_simple(VALUE self, const char *name)
|
|||
%% write data;
|
||||
|
||||
/**
|
||||
* Lexes the input String specified in the instance variable `@data`. Lexed
|
||||
* values have the same encoding as the input value. This instance variable
|
||||
* is set in the Ruby layer of the lexer.
|
||||
* Lexes the String specified as the method argument. Token values have the
|
||||
* same encoding as the input value.
|
||||
*
|
||||
* The Ragel loop dispatches method calls back to Ruby land to make it easier
|
||||
* to implement complex actions without having to fiddle around with C. This
|
||||
* introduces a small performance overhead compared to a pure C implementation.
|
||||
* However, this is worth the overhead due to it being much easier to maintain.
|
||||
* This method keeps track of an internal state using the instance variables
|
||||
* `@act` and `@cs`.
|
||||
*/
|
||||
VALUE oga_xml_lexer_advance(VALUE self)
|
||||
VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
|
||||
{
|
||||
/* Pull the data in from Ruby land. */
|
||||
VALUE data_block = rb_funcall(self, rb_intern("read_data"), 0);
|
||||
|
||||
/* Make sure that all data passed back to Ruby has the proper encoding. */
|
||||
rb_encoding *encoding = rb_enc_get(data_block);
|
||||
|
||||
|
@ -80,14 +80,28 @@ VALUE oga_xml_lexer_advance(VALUE self)
|
|||
const char *p = data_str_val;
|
||||
const char *pe = data_str_val + strlen(data_str_val);
|
||||
const char *eof = pe;
|
||||
const char *ts, *te;
|
||||
const char *ts = 0;
|
||||
const char *te = 0;
|
||||
|
||||
int act = 0;
|
||||
int cs = 0;
|
||||
int act = NUM2INT(oga_ivar_get(self, "@act"));
|
||||
int cs = NUM2INT(oga_ivar_get(self, "@cs"));
|
||||
|
||||
%% write init;
|
||||
%% write exec;
|
||||
|
||||
oga_ivar_set(self, "@act", INT2NUM(act));
|
||||
oga_ivar_set(self, "@cs", INT2NUM(cs));
|
||||
|
||||
return Qnil;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets the internal state of the lexer.
|
||||
*/
|
||||
VALUE oga_xml_lexer_reset(VALUE self)
|
||||
{
|
||||
oga_ivar_set(self, "@act", INT2NUM(0));
|
||||
oga_ivar_set(self, "@cs", INT2NUM(c_lexer_start));
|
||||
|
||||
return Qnil;
|
||||
}
|
||||
|
||||
|
@ -101,5 +115,6 @@ void Init_liboga_xml_lexer()
|
|||
VALUE mXML = rb_const_get(mOga, rb_intern("XML"));
|
||||
VALUE cLexer = rb_define_class_under(mXML, "Lexer", rb_cObject);
|
||||
|
||||
rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 0);
|
||||
rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1);
|
||||
rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0);
|
||||
}
|
||||
|
|
|
@ -11,6 +11,7 @@ import org.jruby.RubyModule;
|
|||
import org.jruby.RubyClass;
|
||||
import org.jruby.RubyObject;
|
||||
import org.jruby.RubyString;
|
||||
import org.jruby.RubyFixnum;
|
||||
import org.jruby.util.ByteList;
|
||||
import org.jruby.anno.JRubyClass;
|
||||
import org.jruby.anno.JRubyMethod;
|
||||
|
@ -36,6 +37,10 @@ public class Lexer extends RubyObject
|
|||
|
||||
%% write data;
|
||||
|
||||
/* Used by Ragel to keep track of the current state. */
|
||||
int act;
|
||||
int cs;
|
||||
|
||||
/**
|
||||
* Sets up the current class in the Ruby runtime.
|
||||
*/
|
||||
|
@ -79,28 +84,35 @@ public class Lexer extends RubyObject
|
|||
* This method always returns nil.
|
||||
*/
|
||||
@JRubyMethod
|
||||
public IRubyObject advance_native(ThreadContext context)
|
||||
public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
|
||||
{
|
||||
// Pull the data in from Ruby land.
|
||||
RubyString rb_str = (RubyString) this.callMethod(context, "read_data");
|
||||
Encoding encoding = rb_str.getEncoding();
|
||||
|
||||
byte[] data = rb_str.getBytes();
|
||||
|
||||
int act = 0;
|
||||
int cs = 0;
|
||||
int ts = 0;
|
||||
int te = 0;
|
||||
int p = 0;
|
||||
int pe = data.length;
|
||||
int eof = data.length;
|
||||
|
||||
%% write init;
|
||||
%% write exec;
|
||||
|
||||
return context.nil;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets the internal state of the lexer.
|
||||
*/
|
||||
@JRubyMethod
|
||||
public IRubyObject reset_native(ThreadContext context)
|
||||
{
|
||||
this.act = 0;
|
||||
this.cs = java_lexer_start;
|
||||
|
||||
return context.nil;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calls back in to Ruby land passing the current token value along.
|
||||
*
|
||||
|
@ -131,5 +143,8 @@ public class Lexer extends RubyObject
|
|||
}
|
||||
|
||||
%%{
|
||||
variable act this.act;
|
||||
variable cs this.cs;
|
||||
|
||||
include base_lexer "base_lexer.rl";
|
||||
}%%
|
||||
|
|
|
@ -7,6 +7,33 @@ module Oga
|
|||
#
|
||||
# lexer = Oga::XML::Lexer.new(:html => true)
|
||||
#
|
||||
# This lexer can process both String and IO instances. IO instances are
|
||||
# processed on a line by line basis. This can greatly reduce memory usage
|
||||
# in exchange for a slightly slower runtime.
|
||||
#
|
||||
# ## Thread Safety
|
||||
#
|
||||
# Since this class keeps track of an internal state you can not use the
|
||||
# same instance between multiple threads at the same time. For example, the
|
||||
# following will not work reliably:
|
||||
#
|
||||
# # Don't do this!
|
||||
# lexer = Oga::XML::Lexer.new('....')
|
||||
# threads = []
|
||||
#
|
||||
# 2.times do
|
||||
# threads << Thread.new do
|
||||
# lexer.advance do |*args|
|
||||
# p args
|
||||
# end
|
||||
# end
|
||||
# end
|
||||
#
|
||||
# threads.each(&:join)
|
||||
#
|
||||
# However, it is perfectly safe to use different instances per thread.
|
||||
# There is no _global_ state used by this lexer.
|
||||
#
|
||||
# @!attribute [r] html
|
||||
# @return [TrueClass|FalseClass]
|
||||
#
|
||||
|
@ -39,7 +66,8 @@ module Oga
|
|||
])
|
||||
|
||||
##
|
||||
# @param [String] data The data to lex.
|
||||
# @param [String|IO] data The data to lex. This can either be a String or
|
||||
# an IO instance.
|
||||
#
|
||||
# @param [Hash] options
|
||||
#
|
||||
|
@ -62,15 +90,36 @@ module Oga
|
|||
def reset
|
||||
@line = 1
|
||||
@elements = []
|
||||
|
||||
reset_native
|
||||
end
|
||||
|
||||
##
|
||||
# Returns the next block of data to lex.
|
||||
# Yields the data to lex to the supplied block.
|
||||
#
|
||||
# @return [String]
|
||||
# @yieldparam [String]
|
||||
#
|
||||
def read_data
|
||||
return @data
|
||||
# We can't check for #each_line since String also defines that. Using
|
||||
# String#each_line has no benefit over just lexing the String in one
|
||||
# go.
|
||||
if io_input?
|
||||
@data.each_line do |line|
|
||||
yield line
|
||||
end
|
||||
else
|
||||
yield @data
|
||||
end
|
||||
end
|
||||
|
||||
##
|
||||
# Returns `true` if the input is an IO-like object, `false` otherwise.
|
||||
#
|
||||
# @return [TrueClass|FalseClass]
|
||||
#
|
||||
def io_input?
|
||||
return @data.is_a?(IO) || @data.is_a?(StringIO)
|
||||
end
|
||||
|
||||
##
|
||||
|
@ -79,9 +128,8 @@ module Oga
|
|||
# This method resets the internal state of the lexer after consuming the
|
||||
# input.
|
||||
#
|
||||
# @param [String] data The string to consume.
|
||||
# @return [Array]
|
||||
# @see #advance
|
||||
# @return [Array]
|
||||
#
|
||||
def lex
|
||||
tokens = []
|
||||
|
@ -110,14 +158,16 @@ module Oga
|
|||
#
|
||||
# This method does *not* reset the internal state of the lexer.
|
||||
#
|
||||
#
|
||||
# @param [String] data The String to consume.
|
||||
# @return [Array]
|
||||
# @yieldparam [Symbol] type
|
||||
# @yieldparam [String] value
|
||||
# @yieldparam [Fixnum] line
|
||||
#
|
||||
def advance(&block)
|
||||
@block = block
|
||||
|
||||
advance_native
|
||||
read_data do |chunk|
|
||||
advance_native(chunk)
|
||||
end
|
||||
ensure
|
||||
@block = nil
|
||||
end
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::XML::Lexer do
|
||||
context 'IO as input' do
|
||||
example 'lex a paragraph element with attributes' do
|
||||
io = StringIO.new("<p class='foo'>\nHello</p>")
|
||||
|
||||
lex(io).should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'p', 1],
|
||||
[:T_ATTR, 'class', 1],
|
||||
[:T_STRING, 'foo', 1],
|
||||
[:T_TEXT, "\n", 1],
|
||||
[:T_TEXT, 'Hello', 2],
|
||||
[:T_ELEM_END, nil, 2]
|
||||
]
|
||||
end
|
||||
end
|
||||
end
|
|
@ -1,4 +1,5 @@
|
|||
require 'rspec'
|
||||
require 'stringio'
|
||||
|
||||
if ENV['COVERAGE']
|
||||
require_relative 'support/simplecov'
|
||||
|
|
Loading…
Reference in New Issue