Documented the lexer.

This commit is contained in:
Yorick Peterse 2014-03-19 22:05:57 +01:00
parent 192ba9bb54
commit 03774f2788
1 changed files with 91 additions and 0 deletions

View File

@ -2,12 +2,26 @@
module Oga module Oga
## ##
# Low level lexer that supports both XML and HTML (using an extra option). To
# lex HTML input set the `:html` option to `true` when creating an instance
# of the lexer:
#
# lexer = Oga::Lexer.new(:html => true)
#
# @!attribute [r] html
# @return [TrueClass|FalseClass]
# #
class Lexer class Lexer
%% write data; # % %% write data; # %
attr_reader :html attr_reader :html
##
# Names of the HTML void elements that should be handled when HTML lexing
# is enabled.
#
# @return [Array]
#
HTML_VOID_ELEMENTS = [ HTML_VOID_ELEMENTS = [
'area', 'area',
'base', 'base',
@ -37,6 +51,13 @@ module Oga
private(name) private(name)
end end
##
# @param [Hash] options
#
# @option options [Symbol] :html When set to `true` the lexer will treat
# the input as HTML instead of SGML/XML. This makes it possible to lex
# HTML void elements such as `<link href="">`.
#
def initialize(options = {}) def initialize(options = {})
options.each do |key, value| options.each do |key, value|
instance_variable_set("@#{key}", value) if respond_to?(key) instance_variable_set("@#{key}", value) if respond_to?(key)
@ -45,6 +66,10 @@ module Oga
reset reset
end end
##
# Resets the internal state of the lexer. Typically you don't need to call
# this method yourself as its called by #lex after lexing a given String.
#
def reset def reset
@line = 1 @line = 1
@column = 1 @column = 1
@ -60,6 +85,17 @@ module Oga
@text_buffer = '' @text_buffer = ''
end end
##
# Lexes the supplied String and returns an Array of tokens. Each token is
# an Array in the following format:
#
# [TYPE, VALUE]
#
# The type is a symbol, the value is either nil or a String.
#
# @param [String] data The string to lex.
# @return [Array]
#
def lex(data) def lex(data)
@data = data @data = data
lexer_start = self.class.lexer_start lexer_start = self.class.lexer_start
@ -75,31 +111,67 @@ module Oga
return tokens return tokens
end end
##
# @return [TrueClass|FalseClass]
#
def html? def html?
return !!html return !!html
end end
private private
##
# @param [Fixnum] amount The amount of lines to advance.
#
def advance_line(amount = 1) def advance_line(amount = 1)
@line += amount @line += amount
@column = 1 @column = 1
end end
##
# @param [Fixnum] length The amount of columns to advance.
#
def advance_column(length = 1) def advance_column(length = 1)
@column += length @column += length
end end
##
# Emits a token who's value is based on the supplied start/stop position.
#
# @param [Symbol] type The token type.
# @param [Fixnum] start
# @param [Fixnum] stop
#
# @see #text
# @see #add_token
#
def t(type, start = @ts, stop = @te) def t(type, start = @ts, stop = @te)
value = text(start, stop) value = text(start, stop)
add_token(type, value) add_token(type, value)
end end
##
# Returns the text of the current buffer based on the supplied start and
# stop position.
#
# By default `@ts` and `@te` are used as the start/stop position.
#
# @param [Fixnum] start
# @param [Fixnum] stop
# @return [String]
#
def text(start = @ts, stop = @te) def text(start = @ts, stop = @te)
return @data[start...stop] return @data[start...stop]
end end
##
# Adds a token with the given type and value to the list. If a value is
# given the column number is also advanced based on the value's length.
#
# @param [Symbol] type The token type.
# @param [String] value The token value.
#
def add_token(type, value) def add_token(type, value)
token = [type, value, @line, @column] token = [type, value, @line, @column]
@ -108,6 +180,10 @@ module Oga
@tokens << token @tokens << token
end end
##
# Emits the current text buffer if we have any. The current line number is
# advanced based on the amount of newlines in the buffer.
#
def emit_text_buffer def emit_text_buffer
return if @text_buffer.empty? return if @text_buffer.empty?
@ -120,12 +196,22 @@ module Oga
@text_buffer = '' @text_buffer = ''
end end
##
# Buffers text until the current token position hits the EOF position. Once
# this position is reached the buffer is emitted.
#
# @param [Fixnum] eof The EOF position.
# @see #emit_text_buffer
#
def buffer_text_until_eof(eof) def buffer_text_until_eof(eof)
@text_buffer << text @text_buffer << text
emit_text_buffer if @te == eof emit_text_buffer if @te == eof
end end
##
# Emits and resets the current string buffer.
#
def emit_string_buffer def emit_string_buffer
add_token(:T_STRING, @string_buffer) add_token(:T_STRING, @string_buffer)
advance_column advance_column
@ -133,6 +219,11 @@ module Oga
@string_buffer = '' @string_buffer = ''
end end
##
# Returns the name of the element we're currently in.
#
# @return [String]
#
def current_element def current_element
return @elements.last return @elements.last
end end