From 71aefb53cc249510cd89bfec99488a263c6610c1 Mon Sep 17 00:00:00 2001
From: Yorick Peterse
Date: Fri, 13 Feb 2015 10:53:34 +0100
Subject: [PATCH] Started porting the XML parser to ruby-ll

This is far from done.
---
Review notes (text in this section is not part of the commit message):

* "document" no longer carries its own empty branch; "expressions" already
  matches empty input, so the extra branch only made the rule ambiguous.
* "string_body_follow" now recurses into "string_body" directly. Before, a
  string made of an even number of T_STRING_BODY tokens could not be parsed.
* The blob ID on the "index" line of parser.rll still refers to the contents
  of the original commit.

 lib/oga/xml/parser.rll | 156 +++++++++++++++++++++++++++++++++++++++++
 oga.gemspec            |   1 +
 2 files changed, 157 insertions(+)
 create mode 100644 lib/oga/xml/parser.rll

diff --git a/lib/oga/xml/parser.rll b/lib/oga/xml/parser.rll
new file mode 100644
index 0000000..305f07f
--- /dev/null
+++ b/lib/oga/xml/parser.rll
@@ -0,0 +1,156 @@
+%name Oga::XML::Parser;
+
+%terminals T_TEXT T_STRING_SQUOTE T_STRING_DQUOTE T_STRING_BODY;
+%terminals T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME;
+%terminals T_DOCTYPE_INLINE T_CDATA T_COMMENT;
+%terminals T_ELEM_START T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR T_ATTR_NS;
+%terminals T_XML_DECL_START T_XML_DECL_END;
+%terminals T_PROC_INS_START T_PROC_INS_NAME T_PROC_INS_END;
+
+# Entry point. "expressions" already matches empty input, so a separate
+# epsilon branch here would make the rule ambiguous.
+document
+  = expressions
+  ;
+
+# Zero or more expressions, written right-recursively.
+expressions
+  = expression expressions
+  | _
+  ;
+
+# Only doctypes are handled so far; most of the terminals declared above
+# are not referenced by any rule yet.
+expression
+  = doctype
+  ;
+
+# Doctypes
+#
+# This parses the following token sequences:
+#
+#     T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_END
+#     T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE string T_DOCTYPE_END
+#     T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE string string T_DOCTYPE_END
+#     T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_INLINE [T_DOCTYPE_INLINE ...] T_DOCTYPE_END
+#
+# For now the action only returns the value of doctype_follow: the name
+# (val[1]) is dropped and on_doctype is not called yet.
+doctype
+  = T_DOCTYPE_START T_DOCTYPE_NAME doctype_follow
+    {
+      val[2]
+    }
+  ;
+
+# Returns: [T_DOCTYPE_TYPE, string, string, doctype_inline]
+#
+# Trailing entries may be missing: a bare T_DOCTYPE_END yields an empty Array
+# and the T_DOCTYPE_TYPE branch yields only the first three entries.
+doctype_follow
+  = T_DOCTYPE_END { [] }
+  | T_DOCTYPE_TYPE doctype_types { [val[0], *val[1]] }
+  | doctype_inline T_DOCTYPE_END { [nil, nil, nil, val[0]] }
+  ;
+
+# One or more T_DOCTYPE_INLINE tokens, concatenated into a single String.
+doctype_inline
+  = T_DOCTYPE_INLINE doctype_inline_follow { val[0] + val[1] }
+  ;
+
+doctype_inline_follow
+  = doctype_inline { val[0] }
+  | _ { '' }
+  ;
+
+# Returns: [string, string or nil]
+doctype_types
+  = string doctype_types_follow { [val[0], val[1]] }
+  ;
+
+doctype_types_follow
+  = string T_DOCTYPE_END { val[0] }
+  | T_DOCTYPE_END { nil }
+  ;
+
+# Strings
+#
+# A quoted string yields the concatenation of its T_STRING_BODY tokens, or an
+# empty String when the quotes enclose nothing.
+
+string
+  = T_STRING_DQUOTE string_dquote_follow { val[1] }
+  | T_STRING_SQUOTE string_squote_follow { val[1] }
+  ;
+
+string_dquote_follow
+  = T_STRING_DQUOTE { '' }
+  | string_body T_STRING_DQUOTE { val[0] }
+  ;
+
+string_squote_follow
+  = T_STRING_SQUOTE { '' }
+  | string_body T_STRING_SQUOTE { val[0] }
+  ;
+
+# One or more T_STRING_BODY tokens, concatenated into a single String.
+string_body
+  = T_STRING_BODY string_body_follow { val[0] + val[1] }
+  ;
+
+# The recursion goes straight back to string_body so that any number of body
+# tokens is accepted, not only an odd number.
+string_body_follow
+  = string_body { val[0] }
+  | _ { '' }
+  ;
+
+%inner
+{
+  ##
+  # @param [String|IO] data The input to parse.
+  # @param [Hash] options
+  # @see [Oga::XML::Lexer#initialize]
+  #
+  def initialize(data, options = {})
+    @data = data
+    @lexer = Lexer.new(data, options)
+
+    reset
+  end
+
+  ##
+  # Resets the internal state of the parser.
+  #
+  def reset
+    @line = 1
+
+    @lexer.reset
+  end
+
+  ##
+  # Yields the next token from the lexer.
+  #
+  # Each token is yielded as a [type, value] Array. Once the lexer is done
+  # [-1, -1] is yielded, which ruby-ll treats as the end of the input.
+  #
+  # @yieldparam [Array]
+  #
+  def each_token
+    @lexer.advance do |type, value, line|
+      @line = line if line
+
+      yield [type, value]
+    end
+
+    yield [-1, -1]
+  end
+
+  ##
+  # @param [Hash] options
+  # @return [Doctype]
+  #
+  def on_doctype(options = {})
+    return Doctype.new(options)
+  end
+}
diff --git a/oga.gemspec b/oga.gemspec
index c73556b..cfe01ff 100644
--- a/oga.gemspec
+++ b/oga.gemspec
@@ -34,6 +34,7 @@ Gem::Specification.new do |s|
 
   s.add_dependency 'racc', ['~> 1.4', '>= 1.4.12']
   s.add_dependency 'ast'
+  s.add_dependency 'ruby-ll'
 
   s.add_development_dependency 'rake'
   s.add_development_dependency 'rspec', ['~> 3.0']