# Grammar definition for Oga::XML::Parser.
%header
{
# DOM parser for both XML and HTML.
#
# This parser does not produce a dedicated AST, instead it emits XML nodes
# directly. Basic usage of this parser is as follows:
#
#     parser   = Oga::XML::Parser.new('<foo></foo>')
#     document = parser.parse
#
# To enable HTML parsing you'd use the following instead:
#
#     parser   = Oga::XML::Parser.new('<foo></foo>', :html => true)
#     document = parser.parse
#
# In both cases you can use either a String or an IO as the parser input. IO
# instances will result in lower memory overhead, especially when parsing large
# files.
#
}

# Name of the parser class this grammar describes.
%name Oga::XML::Parser;

# Token types that the grammar rules below may reference.
%terminals T_TEXT T_STRING_SQUOTE T_STRING_DQUOTE T_STRING_BODY;
%terminals T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME;
%terminals T_DOCTYPE_INLINE;
%terminals T_COMMENT_START T_COMMENT_BODY T_COMMENT_END;
%terminals T_CDATA_START T_CDATA_BODY T_CDATA_END;
%terminals T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR T_ATTR_NS;
%terminals T_XML_DECL_START T_XML_DECL_END;
%terminals T_PROC_INS_START T_PROC_INS_NAME T_PROC_INS_BODY T_PROC_INS_END;

# Entry rule: hands all top-level expressions to on_document.
document
  = expressions { on_document(val[0]) }
  ;

# Zero or more expressions. Used for the document body as well as for the
# children of an element.
expressions
  = expression*
  ;

# A single node: one alternative per node kind handled by this grammar.
expression
  = doctype
  | cdata
  | comment
  | proc_ins
  | text
  | element
  | xml_decl
  ;

# Doctypes
#
# This parses the following:
#
#     <!DOCTYPE html>
#     <!DOCTYPE html PUBLIC>
#     <!DOCTYPE html PUBLIC "foo">
#     <!DOCTYPE html PUBLIC "foo" "bar">
#     <!DOCTYPE html [ ... ]>
#
# The Array returned by doctype_follow is unpacked positionally; positions it
# does not provide are read as nil.

doctype
  = T_DOCTYPE_START T_DOCTYPE_NAME doctype_follow
    {
      name   = val[1]
      follow = val[2]

      on_doctype(
        :name         => name,
        :type         => follow[0],
        :public_id    => follow[1],
        :system_id    => follow[2],
        :inline_rules => follow[3]
      )
    }
  ;

# Everything that may follow the doctype name.
#
# Returns: [T_DOCTYPE_TYPE, string, string, doctype_inline]
#
# A bare closing tag returns an empty Array and the inline form only fills the
# last position, so the doctype rule sees nil for whatever is absent.
doctype_follow
  = T_DOCTYPE_END                { [] }
  | T_DOCTYPE_TYPE doctype_types { [val[0], *val[1]] }
  | doctype_inline T_DOCTYPE_END { [nil, nil, nil, val[0]] }
  ;

# One or more inline doctype chunks, combined into a single value with +.
doctype_inline
  = T_DOCTYPE_INLINE+ { val[0].inject(:+) }
  ;

# The identifiers following the doctype type, up to the closing tag. The
# doctype rule uses the first string as the public ID and the optional second
# string as the system ID. Returns nil when the doctype closes right away.
doctype_types
  = string string? T_DOCTYPE_END { [val[0], val[1]] }
  | T_DOCTYPE_END                { nil }
  ;

# CDATA tags

# A CDATA section; the combined body text is passed to on_cdata.
cdata
  = T_CDATA_START cdata_body T_CDATA_END { on_cdata(val[1]) }
  ;

# Concatenates consecutive CDATA body tokens; an empty body yields ''.
cdata_body
  = T_CDATA_BODY cdata_body { val[0] + val[1] }
  | _                       { '' }
  ;

# Comments

# A comment; the combined body text is passed to on_comment.
comment
  = T_COMMENT_START comment_body T_COMMENT_END { on_comment(val[1]) }
  ;

# Concatenates consecutive comment body tokens; an empty body yields ''.
comment_body
  = T_COMMENT_BODY comment_body { val[0] + val[1] }
  | _                           { '' }
  ;

# Processing Instructions

# A processing instruction; its name and combined body text are passed to
# on_proc_ins.
proc_ins
  = T_PROC_INS_START T_PROC_INS_NAME proc_ins_body T_PROC_INS_END
    {
      on_proc_ins(val[1], val[2])
    }
  ;

# Concatenates consecutive processing-instruction body tokens; an empty body
# yields ''.
proc_ins_body
  = T_PROC_INS_BODY proc_ins_body { val[0] + val[1] }
  | _                             { '' }
  ;

# Elements

# An element name with an optional namespace prefix.
#
# Returns: [namespace or nil, name]
element_name_ns
  = T_ELEM_NAME           { [nil, val[0]] }
  | T_ELEM_NS T_ELEM_NAME { val }
  ;

# The opening part of an element: its (namespaced) name plus attributes. The
# namespace, name and attributes are passed to on_element.
element_start
  = element_name_ns attributes
    {
      on_element(val[0][0], val[0][1], val[1])
    }
  ;

# A complete element: opening part, child expressions and the closing tag.
# Children are only attached when on_element returned a value; after_element
# is always called with that value.
element
  = element_start expressions T_ELEM_END
    {
      if val[0]
        on_element_children(val[0], val[1])
      end

      after_element(val[0])
    }
  ;

# Attributes

# Zero or more attributes, handed to on_attributes as a single list.
attributes
  = attribute* { on_attributes(val[0]) }
  ;

# A single attribute, with or without a namespace prefix. The value string is
# optional in both forms.
attribute
  # x:foo="bar"
  = T_ATTR_NS T_ATTR string? { on_attribute(val[1], val[0], val[2]) }

  # foo="bar"
  | T_ATTR string? { on_attribute(val[0], nil, val[1]) }
  ;

# XML declarations

# An XML declaration; its attributes are passed to on_xml_decl.
xml_decl
  = T_XML_DECL_START attributes T_XML_DECL_END { on_xml_decl(val[1]) }
  ;

# Plain text

# One or more consecutive text tokens, merged and passed to on_text as a
# single value.
text
  = T_TEXT text_follow
    {
      text = val[1] ? val[0] + val[1] : val[0]

      on_text(text)
    }
  ;

# Any text tokens following the first one, merged into one value. Returns nil
# when there are none.
text_follow
  = T_TEXT text_follow { val[1] ? val[0] + val[1] : val[0] }
  | _                  { nil }
  ;

# Strings
#
# This parses both (empty) single and double quoted strings. The result is the
# string body without the surrounding quotes.

string
  = T_STRING_DQUOTE string_body T_STRING_DQUOTE { val[1] }
  | T_STRING_SQUOTE string_body T_STRING_SQUOTE { val[1] }
  ;

# Concatenates consecutive string body tokens; an empty string yields ''.
string_body
  = T_STRING_BODY string_body { val[0] + val[1] }
  | _                         { '' }
  ;

%inner
{
# Hash mapping token types and dedicated error labels.
#
# Every terminal declared by the grammar has an entry, so error messages do
# not fall back to a raw token name. The :T_STRING, :T_CDATA, :T_COMMENT and
# :T_ELEM_START keys do not match any declared terminal; they are kept only so
# existing lookups keep working. The -1 key is the type of the end-of-input
# token.
#
# @return [Hash]
TOKEN_ERROR_MAPPING = {
  :T_STRING         => 'string',
  :T_STRING_SQUOTE  => 'single quote',
  :T_STRING_DQUOTE  => 'double quote',
  :T_STRING_BODY    => 'string',
  :T_TEXT           => 'text',
  :T_DOCTYPE_START  => 'doctype start',
  :T_DOCTYPE_END    => 'doctype closing tag',
  :T_DOCTYPE_TYPE   => 'doctype type',
  :T_DOCTYPE_NAME   => 'doctype name',
  :T_DOCTYPE_INLINE => 'inline doctype rules',
  :T_CDATA          => 'CDATA',
  :T_CDATA_START    => 'CDATA start',
  :T_CDATA_BODY     => 'CDATA',
  :T_CDATA_END      => 'CDATA closing tag',
  :T_COMMENT        => 'comment',
  :T_COMMENT_START  => 'comment start',
  :T_COMMENT_BODY   => 'comment',
  :T_COMMENT_END    => 'comment closing tag',
  :T_ELEM_START     => 'element start',
  :T_ELEM_NAME      => 'element name',
  :T_ELEM_NS        => 'element namespace',
  :T_ELEM_END       => 'element closing tag',
  :T_ATTR           => 'attribute',
  :T_ATTR_NS        => 'attribute namespace',
  :T_XML_DECL_START => 'XML declaration start',
  :T_XML_DECL_END   => 'XML declaration end',
  :T_PROC_INS_START => 'processing-instruction start',
  :T_PROC_INS_NAME  => 'processing-instruction name',
  :T_PROC_INS_BODY  => 'processing-instruction body',
  :T_PROC_INS_END   => 'processing-instruction closing tag',
  -1                => 'end of input'
}.freeze

# Stores the input, builds a lexer for it and resets the parser state.
#
# @param [String|IO] data The input to parse.
# @param [Hash] options Passed unchanged to the lexer.
# @see [Oga::XML::Lexer#initialize]
def initialize(data, options = {})
  @data  = data
  @lexer = Lexer.new(data, options)

  reset
end

# Resets the internal state of the parser: the tracked line number goes back
# to 1 and the lexer is reset as well.
def reset
  @line = 1

  @lexer.reset
end

# Yields every token produced by the lexer as a [type, value] pair, followed
# by a final [-1, -1] pair once the lexer is done.
#
# The line reported by the lexer is remembered (when present) so that error
# messages can refer to it.
#
# @yieldparam [Array]
def each_token
  @lexer.advance do |type, value, line|
    @line = line if line

    yield [type, value]
  end

  yield [-1, -1]
end

# Builds a descriptive message for a parse failure and raises it.
#
# Token types are translated through TOKEN_ERROR_MAPPING when a label exists.
# A stack type other than :rule, :terminal or :eof still results in an
# LL::ParserError instead of failing while the message is being built.
#
# @param [Integer] stack_type
# @param [Integer] stack_value
# @param [Symbol] token_type
# @param [String] token_value Not used for the message.
# @raise [LL::ParserError] Always.
def parser_error(stack_type, stack_value, token_type, token_value)
  message =
    case id_to_type(stack_type)
    when :rule
      "Unexpected #{token_type} for rule #{stack_value}"
    when :terminal
      expected = id_to_terminal(stack_value)
      expected = TOKEN_ERROR_MAPPING[expected] || expected
      got      = TOKEN_ERROR_MAPPING[token_type] || token_type

      "Unexpected #{got}, expected #{expected} instead"
    when :eof
      'Unexpected end of input'
    else
      # Defensive fallback so an unknown stack type cannot leave the message
      # unset.
      got = TOKEN_ERROR_MAPPING[token_type] || token_type

      "Unexpected #{got}"
    end

  raise LL::ParserError, "#{message} on line #{@line}"
end

# Parses the input and returns the result of the parent implementation.
#
# The parser state is reset afterwards, also when parsing raises, so a failed
# run does not leave stale state behind.
#
# @see [LL::Driver#parse]
def parse
  super
ensure
  reset
end

# Builds the document node. The document type is :html when the lexer is in
# HTML mode and :xml otherwise.
#
# Doctype and XML declaration nodes are stored in their dedicated document
# attributes; every other node is appended to the document's children.
#
# @param [Array] children
# @return [Oga::XML::Document]
def on_document(children = [])
  type     = @lexer.html? ? :html : :xml
  document = Document.new(:type => type)

  children.each do |child|
    case child
    when Doctype
      document.doctype = child
    when XmlDeclaration
      document.xml_declaration = child
    else
      document.children << child
    end
  end

  document
end

# Builds a doctype node from the given options.
#
# @param [Hash] options Passed unchanged to Doctype.new.
def on_doctype(options = {})
  Doctype.new(options)
end

# Builds a CDATA node holding the given text.
#
# @param [String] text
# @return [Oga::XML::Cdata]
def on_cdata(text = nil)
  Cdata.new(:text => text)
end

# Builds a comment node holding the given text.
#
# @param [String] text
# @return [Oga::XML::Comment]
def on_comment(text = nil)
  Comment.new(:text => text)
end

# Builds a processing-instruction node from its name and text.
#
# @param [String] name
# @param [String] text
# @return [Oga::XML::ProcessingInstruction]
def on_proc_ins(name, text = nil)
  ProcessingInstruction.new(:name => name, :text => text)
end

# Builds an XML declaration node. Each attribute becomes an option whose key
# is the attribute name (as a Symbol) and whose value is the attribute value.
#
# @param [Array] attributes
# @return [Oga::XML::XmlDeclaration]
def on_xml_decl(attributes = [])
  options = attributes.each_with_object({}) do |attr, hash|
    hash[attr.name.to_sym] = attr.value
  end

  XmlDeclaration.new(options)
end

# Builds a text node holding the given text.
#
# @param [String] text
# @return [Oga::XML::Text]
def on_text(text)
  Text.new(:text => text)
end

# Builds an element node.
#
# The grammar passes the value returned by on_attributes (a list of attribute
# nodes) as the attributes. The default is left as an empty Hash for
# backwards compatibility.
#
# @param [String] namespace
# @param [String] name
# @param [Array] attributes
# @return [Oga::XML::Element]
def on_element(namespace, name, attributes = {})
  Element.new(
    :namespace_name => namespace,
    :name           => name,
    :attributes     => attributes
  )
end

# Assigns the given children to the element and returns the element.
#
# @param [Oga::XML::Element] element
# @param [Array] children
# @return [Oga::XML::Element]
def on_element_children(element, children = [])
  element.children = children

  element
end

# Called once an element has been parsed completely. Returns the element
# unchanged.
#
# @param [Oga::XML::Element] element
# @return [Oga::XML::Element]
def after_element(element)
  element
end

# Builds an attribute node.
#
# @param [String] name
# @param [String] ns_name Namespace name, nil when the attribute has none.
# @param [String] value Attribute value, nil when none was given.
# @return [Oga::XML::Attribute]
def on_attribute(name, ns_name = nil, value = nil)
  Attribute.new(
    :namespace_name => ns_name,
    :name           => name,
    :value          => value
  )
end

# Receives the list of attributes of an element or XML declaration and
# returns it unchanged.
#
# @param [Array] attrs
# @return [Array]
def on_attributes(attrs)
  attrs
end
}