518 lines
13 KiB
Ruby
518 lines
13 KiB
Ruby
module Oga
|
|
module XML
|
|
##
|
|
# Low level lexer that supports both XML and HTML (using an extra option).
|
|
# To lex HTML input set the `:html` option to `true` when creating an
|
|
# instance of the lexer:
|
|
#
|
|
# lexer = Oga::XML::Lexer.new(:html => true)
|
|
#
|
|
# This lexer can process both String and IO instances. IO instances are
|
|
# processed on a line by line basis. This can greatly reduce memory usage
|
|
# in exchange for a slightly slower runtime.
|
|
#
|
|
# ## Thread Safety
|
|
#
|
|
# Since this class keeps track of an internal state you can not use the
|
|
# same instance between multiple threads at the same time. For example, the
|
|
# following will not work reliably:
|
|
#
|
|
# # Don't do this!
|
|
# lexer = Oga::XML::Lexer.new('....')
|
|
# threads = []
|
|
#
|
|
# 2.times do
|
|
# threads << Thread.new do
|
|
# lexer.advance do |*args|
|
|
# p args
|
|
# end
|
|
# end
|
|
# end
|
|
#
|
|
# threads.each(&:join)
|
|
#
|
|
# However, it is perfectly save to use different instances per thread.
|
|
# There is no _global_ state used by this lexer.
|
|
#
|
|
# @!attribute [r] html
|
|
# @return [TrueClass|FalseClass]
|
|
#
|
|
class Lexer
|
|
attr_reader :html
|
|
|
|
# These are all constant/frozen to remove the need for String allocations
|
|
# every time they are referenced in the lexer.
|
|
HTML_SCRIPT = 'script'.freeze
|
|
HTML_STYLE = 'style'.freeze
|
|
|
|
# Elements that should be closed automatically before a new opening tag is
|
|
# processed.
|
|
HTML_CLOSE_SELF = {
|
|
'html' => NodeNameSet.new(%w{html}),
|
|
'head' => NodeNameSet.new(%w{head body}),
|
|
'body' => NodeNameSet.new(%w{body head}),
|
|
'base' => NodeNameSet.new(%w{base}),
|
|
'link' => NodeNameSet.new(%w{link}),
|
|
'meta' => NodeNameSet.new(%w{meta}),
|
|
'noscript' => NodeNameSet.new(%w{noscript}),
|
|
'template' => NodeNameSet.new(%w{template}),
|
|
'title' => NodeNameSet.new(%w{title}),
|
|
'li' => NodeNameSet.new(%w{li}),
|
|
'dt' => NodeNameSet.new(%w{dt dd}),
|
|
'dd' => NodeNameSet.new(%w{dd dt}),
|
|
'rb' => NodeNameSet.new(%w{rb rt rtc rp}),
|
|
'rt' => NodeNameSet.new(%w{rb rt rtc rp}),
|
|
'rtc' => NodeNameSet.new(%w{rb rtc rp}),
|
|
'rp' => NodeNameSet.new(%w{rb rt rtc rp}),
|
|
'optgroup' => NodeNameSet.new(%w{optgroup}),
|
|
'option' => NodeNameSet.new(%w{option optgroup}),
|
|
'colgroup' => NodeNameSet.new(%w{thead tbody tfoot colgroup tr}),
|
|
'caption' => NodeNameSet.new(%w{thead tbody tfoot colgroup tr caption}),
|
|
'thead' => NodeNameSet.new(%w{thead tbody tfoot}),
|
|
'tbody' => NodeNameSet.new(%w{thead tbody tfoot}),
|
|
'tfoot' => NodeNameSet.new(%w{thead tbody tfoot}),
|
|
'tr' => NodeNameSet.new(%w{tr tbody thead tfoot}),
|
|
'td' => NodeNameSet.new(%w{td th tbody thead tfoot tr}),
|
|
'th' => NodeNameSet.new(%w{td th tbody thead tfoot tr}),
|
|
'p' => NodeNameSet.new(%w{
|
|
address article aside blockquote div dl fieldset footer form h1 h2 h3
|
|
h4 h5 h6 header hgroup hr main nav ol p pre section table ul
|
|
})
|
|
}
|
|
|
|
HTML_CLOSE_SELF.keys.each do |key|
|
|
HTML_CLOSE_SELF[key.upcase] = HTML_CLOSE_SELF[key]
|
|
end
|
|
|
|
##
|
|
# Names of HTML tags of which the content should be lexed as-is.
|
|
#
|
|
LITERAL_HTML_ELEMENTS = Whitelist.new([HTML_SCRIPT, HTML_STYLE])
|
|
|
|
##
|
|
# @param [String|IO] data The data to lex. This can either be a String or
|
|
# an IO instance.
|
|
#
|
|
# @param [Hash] options
|
|
#
|
|
# @option options [Symbol] :html When set to `true` the lexer will treat
|
|
# the input as HTML instead of SGML/XML. This makes it possible to lex
|
|
# HTML void elements such as `<link href="">`.
|
|
#
|
|
def initialize(data, options = {})
|
|
@data = data
|
|
@html = options[:html]
|
|
|
|
reset
|
|
end
|
|
|
|
##
|
|
# Resets the internal state of the lexer. Typically you don't need to
|
|
# call this method yourself as its called by #lex after lexing a given
|
|
# String.
|
|
#
|
|
def reset
|
|
@line = 1
|
|
@elements = []
|
|
|
|
@data.rewind if @data.respond_to?(:rewind)
|
|
|
|
reset_native
|
|
end
|
|
|
|
##
|
|
# Yields the data to lex to the supplied block.
|
|
#
|
|
# @return [String]
|
|
# @yieldparam [String]
|
|
#
|
|
def read_data
|
|
if @data.is_a?(String)
|
|
yield @data
|
|
|
|
# IO, StringIO, etc
|
|
# THINK: read(N) would be nice, but currently this screws up the C code
|
|
elsif @data.respond_to?(:each_line)
|
|
@data.each_line { |line| yield line }
|
|
|
|
# Enumerator, Array, etc
|
|
elsif @data.respond_to?(:each)
|
|
@data.each { |chunk| yield chunk }
|
|
end
|
|
end
|
|
|
|
##
|
|
# Gathers all the tokens for the input and returns them as an Array.
|
|
#
|
|
# This method resets the internal state of the lexer after consuming the
|
|
# input.
|
|
#
|
|
# @see #advance
|
|
# @return [Array]
|
|
#
|
|
def lex
|
|
tokens = []
|
|
|
|
advance do |type, value, line|
|
|
tokens << [type, value, line]
|
|
end
|
|
|
|
reset
|
|
|
|
return tokens
|
|
end
|
|
|
|
##
|
|
# Advances through the input and generates the corresponding tokens. Each
|
|
# token is yielded to the supplied block.
|
|
#
|
|
# Each token is an Array in the following format:
|
|
#
|
|
# [TYPE, VALUE]
|
|
#
|
|
# The type is a symbol, the value is either nil or a String.
|
|
#
|
|
# This method stores the supplied block in `@block` and resets it after
|
|
# the lexer loop has finished.
|
|
#
|
|
# This method does *not* reset the internal state of the lexer.
|
|
#
|
|
# @yieldparam [Symbol] type
|
|
# @yieldparam [String] value
|
|
# @yieldparam [Fixnum] line
|
|
#
|
|
def advance(&block)
|
|
@block = block
|
|
|
|
read_data do |chunk|
|
|
advance_native(chunk)
|
|
end
|
|
|
|
# Add any missing closing tags
|
|
unless @elements.empty?
|
|
@elements.length.times { on_element_end }
|
|
end
|
|
ensure
|
|
@block = nil
|
|
end
|
|
|
|
##
|
|
# @return [TrueClass|FalseClass]
|
|
#
|
|
def html?
|
|
return !!html
|
|
end
|
|
|
|
##
|
|
# @return [TrueClass|FalseClass]
|
|
#
|
|
def html_script?
|
|
return html? && current_element == HTML_SCRIPT
|
|
end
|
|
|
|
##
|
|
# @return [TrueClass|FalseClass]
|
|
#
|
|
def html_style?
|
|
return html? && current_element == HTML_STYLE
|
|
end
|
|
|
|
private
|
|
|
|
##
|
|
# @param [Fixnum] amount The amount of lines to advance.
|
|
#
|
|
def advance_line(amount = 1)
|
|
@line += amount
|
|
end
|
|
|
|
##
|
|
# Calls the supplied block with the information of the current token.
|
|
#
|
|
# @param [Symbol] type The token type.
|
|
# @param [String] value The token value.
|
|
#
|
|
# @yieldparam [String] type
|
|
# @yieldparam [String] value
|
|
# @yieldparam [Fixnum] line
|
|
#
|
|
def add_token(type, value = nil)
|
|
@block.call(type, value, @line)
|
|
end
|
|
|
|
##
|
|
# Returns the name of the element we're currently in.
|
|
#
|
|
# @return [String]
|
|
#
|
|
def current_element
|
|
return @elements.last
|
|
end
|
|
|
|
##
|
|
# Called when processing a single quote.
|
|
#
|
|
def on_string_squote
|
|
add_token(:T_STRING_SQUOTE)
|
|
end
|
|
|
|
##
|
|
# Called when processing a double quote.
|
|
#
|
|
def on_string_dquote
|
|
add_token(:T_STRING_DQUOTE)
|
|
end
|
|
|
|
##
|
|
# Called when processing the body of a string.
|
|
#
|
|
# @param [String] value The data between the quotes.
|
|
#
|
|
def on_string_body(value)
|
|
add_token(:T_STRING_BODY, value)
|
|
end
|
|
|
|
##
|
|
# Called when a doctype starts.
|
|
#
|
|
def on_doctype_start
|
|
add_token(:T_DOCTYPE_START)
|
|
end
|
|
|
|
##
|
|
# Called on the identifier specifying the type of the doctype.
|
|
#
|
|
# @param [String] value
|
|
#
|
|
def on_doctype_type(value)
|
|
add_token(:T_DOCTYPE_TYPE, value)
|
|
end
|
|
|
|
##
|
|
# Called on the identifier specifying the name of the doctype.
|
|
#
|
|
# @param [String] value
|
|
#
|
|
def on_doctype_name(value)
|
|
add_token(:T_DOCTYPE_NAME, value)
|
|
end
|
|
|
|
##
|
|
# Called on the end of a doctype.
|
|
#
|
|
def on_doctype_end
|
|
add_token(:T_DOCTYPE_END)
|
|
end
|
|
|
|
##
|
|
# Called on an inline doctype block.
|
|
#
|
|
# @param [String] value
|
|
#
|
|
def on_doctype_inline(value)
|
|
add_token(:T_DOCTYPE_INLINE, value)
|
|
end
|
|
|
|
##
|
|
# Called on the open CDATA tag.
|
|
#
|
|
def on_cdata_start
|
|
add_token(:T_CDATA_START)
|
|
end
|
|
|
|
##
|
|
# Called on the closing CDATA tag.
|
|
#
|
|
def on_cdata_end
|
|
add_token(:T_CDATA_END)
|
|
end
|
|
|
|
##
|
|
# Called for the body of a CDATA tag.
|
|
#
|
|
# @param [String] value
|
|
#
|
|
def on_cdata_body(value)
|
|
add_token(:T_CDATA_BODY, value)
|
|
end
|
|
|
|
##
|
|
# Called on the open comment tag.
|
|
#
|
|
def on_comment_start
|
|
add_token(:T_COMMENT_START)
|
|
end
|
|
|
|
##
|
|
# Called on the closing comment tag.
|
|
#
|
|
def on_comment_end
|
|
add_token(:T_COMMENT_END)
|
|
end
|
|
|
|
##
|
|
# Called on a comment.
|
|
#
|
|
# @param [String] value
|
|
#
|
|
def on_comment_body(value)
|
|
add_token(:T_COMMENT_BODY, value)
|
|
end
|
|
|
|
##
|
|
# Called on the start of an XML declaration tag.
|
|
#
|
|
def on_xml_decl_start
|
|
add_token(:T_XML_DECL_START)
|
|
end
|
|
|
|
##
|
|
# Called on the end of an XML declaration tag.
|
|
#
|
|
def on_xml_decl_end
|
|
add_token(:T_XML_DECL_END)
|
|
end
|
|
|
|
##
|
|
# Called on the start of a processing instruction.
|
|
#
|
|
def on_proc_ins_start
|
|
add_token(:T_PROC_INS_START)
|
|
end
|
|
|
|
##
|
|
# Called on a processing instruction name.
|
|
#
|
|
# @param [String] value
|
|
#
|
|
def on_proc_ins_name(value)
|
|
add_token(:T_PROC_INS_NAME, value)
|
|
end
|
|
|
|
##
|
|
# Called on the body of a processing instruction.
|
|
#
|
|
# @param [String] value
|
|
#
|
|
def on_proc_ins_body(value)
|
|
add_token(:T_PROC_INS_BODY, value)
|
|
end
|
|
|
|
##
|
|
# Called on the end of a processing instruction.
|
|
#
|
|
def on_proc_ins_end
|
|
add_token(:T_PROC_INS_END)
|
|
end
|
|
|
|
##
|
|
# Called on the name of an element.
|
|
#
|
|
# @param [String] name The name of the element, including namespace.
|
|
#
|
|
def on_element_name(name)
|
|
before_html_element_name(name) if html?
|
|
|
|
add_element(name)
|
|
end
|
|
|
|
##
|
|
# Handles inserting of any missing tags whenever a new HTML tag is opened.
|
|
#
|
|
# @param [String] name
|
|
#
|
|
def before_html_element_name(name)
|
|
close_current = HTML_CLOSE_SELF[current_element]
|
|
|
|
if close_current and close_current.include?(name)
|
|
on_element_end
|
|
end
|
|
|
|
# Close remaining parent elements. This for example ensures that a
|
|
# "<tbody>" not only closes an unclosed "<th>" but also the surrounding,
|
|
# unclosed "<tr>".
|
|
while close_current = HTML_CLOSE_SELF[current_element]
|
|
if close_current.include?(name)
|
|
on_element_end
|
|
else
|
|
break
|
|
end
|
|
end
|
|
end
|
|
|
|
##
|
|
# @param [String] name
|
|
#
|
|
def add_element(name)
|
|
@elements << name
|
|
|
|
add_token(:T_ELEM_NAME, name)
|
|
end
|
|
|
|
##
|
|
# Called on the element namespace.
|
|
#
|
|
# @param [String] namespace
|
|
#
|
|
def on_element_ns(namespace)
|
|
add_token(:T_ELEM_NS, namespace)
|
|
end
|
|
|
|
##
|
|
# Called on the closing `>` of the open tag of an element.
|
|
#
|
|
def on_element_open_end
|
|
return unless html?
|
|
|
|
# Only downcase the name if we can't find an all lower/upper version of
|
|
# the element name. This can save us a *lot* of String allocations.
|
|
if HTML_VOID_ELEMENTS.allow?(current_element) \
|
|
or HTML_VOID_ELEMENTS.allow?(current_element.downcase)
|
|
add_token(:T_ELEM_END)
|
|
@elements.pop
|
|
end
|
|
end
|
|
|
|
##
|
|
# Called on the closing tag of an element.
|
|
#
|
|
def on_element_end
|
|
return if @elements.empty?
|
|
|
|
add_token(:T_ELEM_END)
|
|
|
|
@elements.pop
|
|
end
|
|
|
|
##
|
|
# Called on regular text values.
|
|
#
|
|
# @param [String] value
|
|
#
|
|
def on_text(value)
|
|
return if value.empty?
|
|
|
|
add_token(:T_TEXT, value)
|
|
end
|
|
|
|
##
|
|
# Called on attribute namespaces.
|
|
#
|
|
# @param [String] value
|
|
#
|
|
def on_attribute_ns(value)
|
|
add_token(:T_ATTR_NS, value)
|
|
end
|
|
|
|
##
|
|
# Called on tag attributes.
|
|
#
|
|
# @param [String] value
|
|
#
|
|
def on_attribute(value)
|
|
add_token(:T_ATTR, value)
|
|
end
|
|
end # Lexer
|
|
end # XML
|
|
end # Oga
|