Stream tokens when lexing.

Instead of returning the tokens as a whole, they are now streamed using
XML::Lexer#advance. This method returns the next token on every call. It uses
a small buffer in case a particular block of text results in multiple tokens.
This commit is contained in:
Yorick Peterse 2014-04-09 22:08:13 +02:00
parent e9bb97d261
commit 8237d5791d
4 changed files with 44 additions and 37 deletions

View File

@ -6,13 +6,14 @@ module Oga
# #
class Parser < XML::Parser class Parser < XML::Parser
## ##
# @param [String] data
# @param [Hash] options # @param [Hash] options
# @see Oga::XML::Parser#initialize # @see Oga::XML::Parser#initialize
# #
def initialize(options = {}) def initialize(data, options = {})
options = options.merge(:html => true) options = options.merge(:html => true)
super(options) super(data, options)
end end
end # Parser end # Parser
end # HTML end # HTML

View File

@ -53,17 +53,21 @@ module Oga
end end
## ##
# @param [String] data The data to lex.
#
# @param [Hash] options # @param [Hash] options
# #
# @option options [Symbol] :html When set to `true` the lexer will treat # @option options [Symbol] :html When set to `true` the lexer will treat
# the input as HTML instead of SGML/XML. This makes it possible to lex # the input as HTML instead of SGML/XML. This makes it possible to lex
# HTML void elements such as `<link href="">`. # HTML void elements such as `<link href="">`.
# #
def initialize(options = {}) def initialize(data, options = {})
options.each do |key, value| options.each do |key, value|
instance_variable_set("@#{key}", value) if respond_to?(key) instance_variable_set("@#{key}", value) if respond_to?(key)
end end
@data = data.unpack('U*')
reset reset
end end
@ -74,7 +78,6 @@ module Oga
# #
def reset def reset
@line = 1 @line = 1
@data = nil
@ts = nil @ts = nil
@te = nil @te = nil
@tokens = [] @tokens = []
@ -83,6 +86,9 @@ module Oga
@cs = self.class.lexer_start @cs = self.class.lexer_start
@act = 0 @act = 0
@elements = [] @elements = []
@eof = @data.length
@p = 0
@pe = @eof
@buffer_start_position = nil @buffer_start_position = nil
end end
@ -102,8 +108,12 @@ module Oga
# @return [Array] # @return [Array]
# @see #advance # @see #advance
# #
def lex(data) def lex
tokens = advance(data) tokens = []
while token = advance
tokens << token
end
reset reset
@ -118,16 +128,10 @@ module Oga
# @param [String] data The String to consume. # @param [String] data The String to consume.
# @return [Array] # @return [Array]
# #
def advance(data) def advance
@data = data.unpack('U*')
eof = data.length
p = 0
pe = eof
%% write exec; # % fix highlight %% write exec; # % fix highlight
return @tokens return @tokens.shift
end end
## ##
@ -244,7 +248,10 @@ module Oga
%%{ %%{
# Use instance variables for `ts` and friends. # Use instance variables for `ts` and friends.
access @; access @;
getkey (@data[p] || 0); getkey (@data[@p] || 0);
variable p @p;
variable pe @pe;
variable eof @eof;
newline = '\n' | '\r\n'; newline = '\n' | '\r\n';
whitespace = [ \t]; whitespace = [ \t];
@ -529,7 +536,7 @@ module Oga
start_buffer(@ts) unless buffering? start_buffer(@ts) unless buffering?
# EOF, emit the text buffer. # EOF, emit the text buffer.
if @te == eof if @te == @eof
emit_buffer(@te) emit_buffer(@te)
end end
}; };

View File

@ -135,13 +135,14 @@ end
---- inner ---- inner
## ##
# @param [String] data The input to parse.
#
# @param [Hash] options # @param [Hash] options
# @see Oga::XML::Lexer#initialize
# #
# @option options [TrueClass|FalseClass] :html Enables HTML parsing mode. def initialize(data, options = {})
# @see Oga::Lexer#initialize @data = data
# @lexer = Lexer.new(data, options)
def initialize(options = {})
@lexer = Lexer.new(options)
end end
## ##
@ -172,7 +173,7 @@ end
# @return [Array] # @return [Array]
# #
def next_token def next_token
type, value, line = @tokens.shift type, value, line = @lexer.advance
@line = line if line @line = line if line
@ -188,11 +189,12 @@ end
def on_error(type, value, stack) def on_error(type, value, stack)
name = token_to_str(type) name = token_to_str(type)
index = @line - 1 index = @line - 1
lines = '' lines = @data.lines.to_a
code = ''
# Show up to 5 lines before and after the offending line (if they exist). # Show up to 5 lines before and after the offending line (if they exist).
(-5..5).each do |offset| (-5..5).each do |offset|
line = @lines[index + offset] line = lines[index + offset]
number = @line + offset number = @line + offset
if line and number > 0 if line and number > 0
@ -202,30 +204,27 @@ end
prefix = ' ' prefix = ' '
end end
lines << "#{prefix}#{number}: #{line.strip}\n" code << "#{prefix}#{number}: #{line.strip}\n"
end end
end end
raise Racc::ParseError, <<-EOF.strip raise Racc::ParseError, <<-EOF.strip
Unexpected #{name} with value #{value.inspect} on line #{@line}: Unexpected #{name} with value #{value.inspect} on line #{@line}:
#{lines} #{code}
EOF EOF
end end
## ##
# Parses the supplied string and returns the AST. # Parses the input and returns the corresponding AST.
# #
# @example # @example
# parser = Oga::Parser.new # parser = Oga::Parser.new('<foo>bar</foo>')
# ast = parser.parse('<foo>bar</foo>') # ast = parser.parse
# #
# @param [String] string
# @return [Oga::AST::Node] # @return [Oga::AST::Node]
# #
def parse(string) def parse
@lines = string.lines
@tokens = @lexer.lex(string)
ast = do_parse ast = do_parse
reset reset

View File

@ -19,7 +19,7 @@ module Oga
# @return [Array] # @return [Array]
# #
def lex(input, options = {}) def lex(input, options = {})
return Oga::XML::Lexer.new(options).lex(input) return Oga::XML::Lexer.new(input, options).lex
end end
## ##
@ -30,7 +30,7 @@ module Oga
# @return [Oga::AST::Node] # @return [Oga::AST::Node]
# #
def parse(input, options = {}) def parse(input, options = {})
return Oga::XML::Parser.new(options).parse(input) return Oga::XML::Parser.new(input, options).parse
end end
## ##
@ -39,7 +39,7 @@ module Oga
# @see #parse # @see #parse
# #
def parse_html(input, options = {}) def parse_html(input, options = {})
return Oga::HTML::Parser.new(options).parse(input) return Oga::HTML::Parser.new(input, options).parse
end end
## ##