From 8237d5791d683f47840f4764e8b4edc227cdb1c4 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Wed, 9 Apr 2014 22:08:13 +0200 Subject: [PATCH] Stream tokens when lexing. Instead of returning the tokens as a whole they are now streamed using XML::Lexer#advance. This method returns the next token upon every call. It uses a small buffer in case a particular block of text results in multiple tokens. --- lib/oga/html/parser.rb | 5 +++-- lib/oga/xml/lexer.rl | 35 +++++++++++++++++++++-------------- lib/oga/xml/parser.y | 35 +++++++++++++++++------------------ spec/support/parsing.rb | 6 +++--- 4 files changed, 44 insertions(+), 37 deletions(-) diff --git a/lib/oga/html/parser.rb b/lib/oga/html/parser.rb index bc12cc1..2395269 100644 --- a/lib/oga/html/parser.rb +++ b/lib/oga/html/parser.rb @@ -6,13 +6,14 @@ module Oga # class Parser < XML::Parser ## + # @param [String] data # @param [Hash] options # @see Oga::XML::Parser#initialize # - def initialize(options = {}) + def initialize(data, options = {}) options = options.merge(:html => true) - super(options) + super(data, options) end end # Parser end # HTML diff --git a/lib/oga/xml/lexer.rl b/lib/oga/xml/lexer.rl index 75667de..6afa0c4 100644 --- a/lib/oga/xml/lexer.rl +++ b/lib/oga/xml/lexer.rl @@ -53,17 +53,21 @@ module Oga end ## + # @param [String] data The data to lex. + # # @param [Hash] options # # @option options [Symbol] :html When set to `true` the lexer will treat # the input as HTML instead of SGML/XML. This makes it possible to lex # HTML void elements such as ``. # - def initialize(options = {}) + def initialize(data, options = {}) options.each do |key, value| instance_variable_set("@#{key}", value) if respond_to?(key) end + @data = data.unpack('U*') + reset end @@ -74,7 +78,6 @@ module Oga # def reset @line = 1 - @data = nil @ts = nil @te = nil @tokens = [] @@ -83,6 +86,9 @@ module Oga @cs = self.class.lexer_start @act = 0 @elements = [] + @eof = @data.length + @p = 0 + @pe = @eof @buffer_start_position = nil end @@ -102,8 +108,12 @@ module Oga # @return [Array] # @see #advance # - def lex(data) - tokens = advance(data) + def lex + tokens = [] + + while token = advance + tokens << token + end reset @@ -118,16 +128,10 @@ module Oga # @param [String] data The String to consume. # @return [Array] # - def advance(data) - @data = data.unpack('U*') - eof = data.length - - p = 0 - pe = eof - + def advance %% write exec; # % fix highlight - return @tokens + return @tokens.shift end ## @@ -244,7 +248,10 @@ module Oga %%{ # Use instance variables for `ts` and friends. access @; - getkey (@data[p] || 0); + getkey (@data[@p] || 0); + variable p @p; + variable pe @pe; + variable eof @eof; newline = '\n' | '\r\n'; whitespace = [ \t]; @@ -529,7 +536,7 @@ module Oga start_buffer(@ts) unless buffering? # EOF, emit the text buffer. - if @te == eof + if @te == @eof emit_buffer(@te) end }; diff --git a/lib/oga/xml/parser.y b/lib/oga/xml/parser.y index ed555b5..27b366d 100644 --- a/lib/oga/xml/parser.y +++ b/lib/oga/xml/parser.y @@ -135,13 +135,14 @@ end ---- inner ## + # @param [String] data The input to parse. + # # @param [Hash] options + # @see Oga::XML::Lexer#initialize # - # @option options [TrueClass|FalseClass] :html Enables HTML parsing mode. - # @see Oga::Lexer#initialize - # - def initialize(options = {}) - @lexer = Lexer.new(options) + def initialize(data, options = {}) + @data = data + @lexer = Lexer.new(data, options) end ## @@ -172,7 +173,7 @@ end # @return [Array] # def next_token - type, value, line = @tokens.shift + type, value, line = @lexer.advance @line = line if line @@ -188,11 +189,12 @@ end def on_error(type, value, stack) name = token_to_str(type) index = @line - 1 - lines = '' + lines = @data.lines.to_a + code = '' # Show up to 5 lines before and after the offending line (if they exist). (-5..5).each do |offset| - line = @lines[index + offset] + line = lines[index + offset] number = @line + offset if line and number > 0 @@ -202,31 +204,28 @@ end prefix = ' ' end - lines << "#{prefix}#{number}: #{line.strip}\n" + code << "#{prefix}#{number}: #{line.strip}\n" end end raise Racc::ParseError, <<-EOF.strip Unexpected #{name} with value #{value.inspect} on line #{@line}: -#{lines} +#{code} EOF end ## - # Parses the supplied string and returns the AST. + # Parses the input and returns the corresponding AST. # # @example - # parser = Oga::Parser.new - # ast = parser.parse('bar') + # parser = Oga::Parser.new('bar') + # ast = parser.parse # - # @param [String] string # @return [Oga::AST::Node] # - def parse(string) - @lines = string.lines - @tokens = @lexer.lex(string) - ast = do_parse + def parse + ast = do_parse reset diff --git a/spec/support/parsing.rb b/spec/support/parsing.rb index 9e347bd..5486708 100644 --- a/spec/support/parsing.rb +++ b/spec/support/parsing.rb @@ -19,7 +19,7 @@ module Oga # @return [Array] # def lex(input, options = {}) - return Oga::XML::Lexer.new(options).lex(input) + return Oga::XML::Lexer.new(input, options).lex end ## @@ -30,7 +30,7 @@ module Oga # @return [Oga::AST::Node] # def parse(input, options = {}) - return Oga::XML::Parser.new(options).parse(input) + return Oga::XML::Parser.new(input, options).parse end ## @@ -39,7 +39,7 @@ module Oga # @see #parse # def parse_html(input, options = {}) - return Oga::HTML::Parser.new(options).parse(input) + return Oga::HTML::Parser.new(input, options).parse end ##