Stream tokens when lexing.

Instead of returning all tokens at once, the lexer now streams them using
XML::Lexer#advance. This method returns the next token upon every call. It uses
a small buffer in case a particular block of text results in multiple tokens.
This commit is contained in:
Yorick Peterse 2014-04-09 22:08:13 +02:00
parent e9bb97d261
commit 8237d5791d
4 changed files with 44 additions and 37 deletions

View File

@ -6,13 +6,14 @@ module Oga
#
class Parser < XML::Parser
##
# @param [String] data
# @param [Hash] options
# @see Oga::XML::Parser#initialize
#
def initialize(options = {})
def initialize(data, options = {})
options = options.merge(:html => true)
super(options)
super(data, options)
end
end # Parser
end # HTML

View File

@ -53,17 +53,21 @@ module Oga
end
##
# @param [String] data The data to lex.
#
# @param [Hash] options
#
# @option options [Symbol] :html When set to `true` the lexer will treat
# the input as HTML instead of SGML/XML. This makes it possible to lex
# HTML void elements such as `<link href="">`.
#
def initialize(options = {})
def initialize(data, options = {})
options.each do |key, value|
instance_variable_set("@#{key}", value) if respond_to?(key)
end
@data = data.unpack('U*')
reset
end
@ -74,7 +78,6 @@ module Oga
#
def reset
@line = 1
@data = nil
@ts = nil
@te = nil
@tokens = []
@ -83,6 +86,9 @@ module Oga
@cs = self.class.lexer_start
@act = 0
@elements = []
@eof = @data.length
@p = 0
@pe = @eof
@buffer_start_position = nil
end
@ -102,8 +108,12 @@ module Oga
# @return [Array]
# @see #advance
#
def lex(data)
tokens = advance(data)
def lex
tokens = []
while token = advance
tokens << token
end
reset
@ -118,16 +128,10 @@ module Oga
# @param [String] data The String to consume.
# @return [Array]
#
def advance(data)
@data = data.unpack('U*')
eof = data.length
p = 0
pe = eof
def advance
%% write exec; # % fix highlight
return @tokens
return @tokens.shift
end
##
@ -244,7 +248,10 @@ module Oga
%%{
# Use instance variables for `ts` and friends.
access @;
getkey (@data[p] || 0);
getkey (@data[@p] || 0);
variable p @p;
variable pe @pe;
variable eof @eof;
newline = '\n' | '\r\n';
whitespace = [ \t];
@ -529,7 +536,7 @@ module Oga
start_buffer(@ts) unless buffering?
# EOF, emit the text buffer.
if @te == eof
if @te == @eof
emit_buffer(@te)
end
};

View File

@ -135,13 +135,14 @@ end
---- inner
##
# @param [String] data The input to parse.
#
# @param [Hash] options
# @see Oga::XML::Lexer#initialize
#
# @option options [TrueClass|FalseClass] :html Enables HTML parsing mode.
# @see Oga::Lexer#initialize
#
def initialize(options = {})
@lexer = Lexer.new(options)
def initialize(data, options = {})
@data = data
@lexer = Lexer.new(data, options)
end
##
@ -172,7 +173,7 @@ end
# @return [Array]
#
def next_token
type, value, line = @tokens.shift
type, value, line = @lexer.advance
@line = line if line
@ -188,11 +189,12 @@ end
def on_error(type, value, stack)
name = token_to_str(type)
index = @line - 1
lines = ''
lines = @data.lines.to_a
code = ''
# Show up to 5 lines before and after the offending line (if they exist).
(-5..5).each do |offset|
line = @lines[index + offset]
line = lines[index + offset]
number = @line + offset
if line and number > 0
@ -202,31 +204,28 @@ end
prefix = ' '
end
lines << "#{prefix}#{number}: #{line.strip}\n"
code << "#{prefix}#{number}: #{line.strip}\n"
end
end
raise Racc::ParseError, <<-EOF.strip
Unexpected #{name} with value #{value.inspect} on line #{@line}:
#{lines}
#{code}
EOF
end
##
# Parses the supplied string and returns the AST.
# Parses the input and returns the corresponding AST.
#
# @example
# parser = Oga::Parser.new
# ast = parser.parse('<foo>bar</foo>')
# parser = Oga::Parser.new('<foo>bar</foo>')
# ast = parser.parse
#
# @param [String] string
# @return [Oga::AST::Node]
#
def parse(string)
@lines = string.lines
@tokens = @lexer.lex(string)
ast = do_parse
def parse
ast = do_parse
reset

View File

@ -19,7 +19,7 @@ module Oga
# @return [Array]
#
def lex(input, options = {})
return Oga::XML::Lexer.new(options).lex(input)
return Oga::XML::Lexer.new(input, options).lex
end
##
@ -30,7 +30,7 @@ module Oga
# @return [Oga::AST::Node]
#
def parse(input, options = {})
return Oga::XML::Parser.new(options).parse(input)
return Oga::XML::Parser.new(input, options).parse
end
##
@ -39,7 +39,7 @@ module Oga
# @see #parse
#
def parse_html(input, options = {})
return Oga::HTML::Parser.new(options).parse(input)
return Oga::HTML::Parser.new(input, options).parse
end
##