Namespaced the lexer/parser under Oga::XML.
With the upcoming XPath and CSS selector lexers/parsers it will be confusing to keep these in the root namespace.
This commit is contained in:
		
							parent
							
								
									2259061c89
								
							
						
					
					
						commit
						eae13d21ed
					
				
							
								
								
									
										6
									
								
								Rakefile
								
								
								
								
							
							
						
						
									
										6
									
								
								Rakefile
								
								
								
								
							|  | @ -5,10 +5,10 @@ require 'cliver' | |||
| 
 | ||||
| GEMSPEC = Gem::Specification.load('oga.gemspec') | ||||
| 
 | ||||
| LEXER_INPUT  = 'lib/oga/lexer.rl' | ||||
| LEXER_OUTPUT = 'lib/oga/lexer.rb' | ||||
| LEXER_INPUT  = 'lib/oga/xml/lexer.rl' | ||||
| LEXER_OUTPUT = 'lib/oga/xml/lexer.rb' | ||||
| 
 | ||||
| HTML_PARSER = 'lib/oga/parser.rb' | ||||
| HTML_PARSER = 'lib/oga/xml/parser.rb' | ||||
| 
 | ||||
| GENERATED_FILES = ['coverage', 'yardoc', LEXER_OUTPUT, HTML_PARSER] | ||||
| 
 | ||||
|  |  | |||
|  | @ -5,7 +5,7 @@ string = 'Hello, how are you doing today?' | |||
| small  = "<![CDATA[#{string}]]>" | ||||
| medium = "<![CDATA[#{string * 1_000}]]>" | ||||
| large  = "<![CDATA[#{string * 10_000}]]>" | ||||
| lexer  = Oga::Lexer.new | ||||
| lexer  = Oga::XML::Lexer.new | ||||
| 
 | ||||
| Benchmark.ips do |bench| | ||||
|   bench.report 'CDATA with a small body' do | ||||
|  |  | |||
|  | @ -4,7 +4,7 @@ require 'benchmark/ips' | |||
| simple     = '<p>Hello world</p>' | ||||
| attributes = '<p class="foo">Hello world</p>' | ||||
| nested     = '<p>Hello<strong>world</strong></p>' | ||||
| lexer      = Oga::Lexer.new | ||||
| lexer      = Oga::XML::Lexer.new | ||||
| 
 | ||||
| Benchmark.ips do |bench| | ||||
|   bench.report 'text only' do | ||||
|  |  | |||
|  | @ -2,7 +2,7 @@ require_relative '../../lib/oga' | |||
| require 'benchmark/ips' | ||||
| 
 | ||||
| html  = File.read(File.expand_path('../../fixtures/hrs.html', __FILE__)) | ||||
| lexer = Oga::Lexer.new(:html => true) | ||||
| lexer = Oga::XML::Lexer.new(:html => true) | ||||
| 
 | ||||
| Benchmark.ips do |bench| | ||||
|   bench.report 'lex HTML' do | ||||
|  |  | |||
|  | @ -2,7 +2,7 @@ require_relative '../../lib/oga' | |||
| require 'benchmark' | ||||
| 
 | ||||
| html  = File.read(File.expand_path('../../fixtures/hrs.html', __FILE__)) | ||||
| lexer = Oga::Lexer.new(:html => true) | ||||
| lexer = Oga::XML::Lexer.new(:html => true) | ||||
| 
 | ||||
| Benchmark.bmbm(20) do |bench| | ||||
|   bench.report 'lex HTML' do | ||||
|  |  | |||
|  | @ -1,5 +1,5 @@ | |||
| require 'ast' | ||||
| 
 | ||||
| require_relative 'oga/ast/node' | ||||
| require_relative 'oga/lexer' | ||||
| require_relative 'oga/parser' | ||||
| require_relative 'oga/xml/lexer' | ||||
| require_relative 'oga/xml/parser' | ||||
|  |  | |||
							
								
								
									
										508
									
								
								lib/oga/lexer.rl
								
								
								
								
							
							
						
						
									
										508
									
								
								lib/oga/lexer.rl
								
								
								
								
							|  | @ -1,508 +0,0 @@ | |||
| %%machine lexer; # % | ||||
| 
 | ||||
| module Oga | ||||
|   ## | ||||
|   # Low level lexer that supports both XML and HTML (using an extra option). To | ||||
|   # lex HTML input set the `:html` option to `true` when creating an instance | ||||
|   # of the lexer: | ||||
|   # | ||||
|   #     lexer = Oga::Lexer.new(:html => true) | ||||
|   # | ||||
|   # @!attribute [r] html | ||||
|   #  @return [TrueClass|FalseClass] | ||||
|   # | ||||
|   class Lexer | ||||
|     %% write data; # % | ||||
| 
 | ||||
|     attr_reader :html | ||||
| 
 | ||||
|     ## | ||||
|     # Names of the HTML void elements that should be handled when HTML lexing | ||||
|     # is enabled. | ||||
|     # | ||||
|     # @return [Array] | ||||
|     # | ||||
|     HTML_VOID_ELEMENTS = [ | ||||
|       'area', | ||||
|       'base', | ||||
|       'br', | ||||
|       'col', | ||||
|       'command', | ||||
|       'embed', | ||||
|       'hr', | ||||
|       'img', | ||||
|       'input', | ||||
|       'keygen', | ||||
|       'link', | ||||
|       'meta', | ||||
|       'param', | ||||
|       'source', | ||||
|       'track', | ||||
|       'wbr' | ||||
|     ] | ||||
| 
 | ||||
|     # Lazy way of forwarding instance method calls used internally by Ragel to | ||||
|     # their corresponding class methods. | ||||
|     private_methods.grep(/^_lexer_/).each do |name| | ||||
|       define_method(name) do | ||||
|         return self.class.send(name) | ||||
|       end | ||||
| 
 | ||||
|       private(name) | ||||
|     end | ||||
| 
 | ||||
|     ## | ||||
|     # @param [Hash] options | ||||
|     # | ||||
|     # @option options [Symbol] :html When set to `true` the lexer will treat | ||||
|     #  the input as HTML instead of SGML/XML. This makes it possible to lex | ||||
|     #  HTML void elements such as `<link href="">`. | ||||
|     # | ||||
|     def initialize(options = {}) | ||||
|       options.each do |key, value| | ||||
|         instance_variable_set("@#{key}", value) if respond_to?(key) | ||||
|       end | ||||
| 
 | ||||
|       reset | ||||
|     end | ||||
| 
 | ||||
|     ## | ||||
|     # Resets the internal state of the lexer. Typically you don't need to call | ||||
|     # this method yourself as its called by #lex after lexing a given String. | ||||
|     # | ||||
|     def reset | ||||
|       @line     = 1 | ||||
|       @data     = nil | ||||
|       @ts       = nil | ||||
|       @te       = nil | ||||
|       @tokens   = [] | ||||
|       @stack    = [] | ||||
|       @top      = 0 | ||||
|       @elements = [] | ||||
| 
 | ||||
|       @buffer_start_position = nil | ||||
|     end | ||||
| 
 | ||||
|     ## | ||||
|     # Lexes the supplied String and returns an Array of tokens. Each token is | ||||
|     # an Array in the following format: | ||||
|     # | ||||
|     #     [TYPE, VALUE] | ||||
|     # | ||||
|     # The type is a symbol, the value is either nil or a String. | ||||
|     # | ||||
|     # @param [String] data The string to lex. | ||||
|     # @return [Array] | ||||
|     # | ||||
|     def lex(data) | ||||
|       @data       = data.unpack('U*') | ||||
|       lexer_start = self.class.lexer_start | ||||
|       eof         = data.length | ||||
| 
 | ||||
|       %% write init; | ||||
|       %% write exec; | ||||
| 
 | ||||
|       tokens = @tokens | ||||
| 
 | ||||
|       reset | ||||
| 
 | ||||
|       return tokens | ||||
|     end | ||||
| 
 | ||||
|     ## | ||||
|     # @return [TrueClass|FalseClass] | ||||
|     # | ||||
|     def html? | ||||
|       return !!html | ||||
|     end | ||||
| 
 | ||||
|     private | ||||
| 
 | ||||
|     ## | ||||
|     # @param [Fixnum] amount The amount of lines to advance. | ||||
|     # | ||||
|     def advance_line(amount = 1) | ||||
|       @line += amount | ||||
|     end | ||||
| 
 | ||||
|     ## | ||||
|     # Emits a token who's value is based on the supplied start/stop position. | ||||
|     # | ||||
|     # @param [Symbol] type The token type. | ||||
|     # @param [Fixnum] start | ||||
|     # @param [Fixnum] stop | ||||
|     # | ||||
|     # @see #text | ||||
|     # @see #add_token | ||||
|     # | ||||
|     def t(type, start = @ts, stop = @te) | ||||
|       value = text(start, stop) | ||||
| 
 | ||||
|       add_token(type, value) | ||||
|     end | ||||
| 
 | ||||
|     ## | ||||
|     # Returns the text of the current buffer based on the supplied start and | ||||
|     # stop position. | ||||
|     # | ||||
|     # By default `@ts` and `@te` are used as the start/stop position. | ||||
|     # | ||||
|     # @param [Fixnum] start | ||||
|     # @param [Fixnum] stop | ||||
|     # @return [String] | ||||
|     # | ||||
|     def text(start = @ts, stop = @te) | ||||
|       return @data[start...stop].pack('U*') | ||||
|     end | ||||
| 
 | ||||
|     ## | ||||
|     # Adds a token with the given type and value to the list. | ||||
|     # | ||||
|     # @param [Symbol] type The token type. | ||||
|     # @param [String] value The token value. | ||||
|     # | ||||
|     def add_token(type, value = nil) | ||||
|       token = [type, value, @line] | ||||
| 
 | ||||
|       @tokens << token | ||||
|     end | ||||
| 
 | ||||
|     ## | ||||
|     # Enables buffering starting at the given position. | ||||
|     # | ||||
|     # @param [Fixnum] position The start position of the buffer, set to `@te` | ||||
|     #  by default. | ||||
|     # | ||||
|     def start_buffer(position = @te) | ||||
|       @buffer_start_position = position | ||||
|     end | ||||
| 
 | ||||
|     ## | ||||
|     # Returns `true` if we're currently buffering. | ||||
|     # | ||||
|     # @return [TrueClass|FalseClass] | ||||
|     # | ||||
|     def buffering? | ||||
|       return !!@buffer_start_position | ||||
|     end | ||||
| 
 | ||||
|     ## | ||||
|     # Emits the current buffer if we have any. The current line number is | ||||
|     # advanced based on the amount of newlines in the buffer. | ||||
|     # | ||||
|     # @param [Fixnum] position The end position of the buffer, set to `@ts` by | ||||
|     #  default. | ||||
|     # | ||||
|     # @param [Symbol] type The type of node to emit. | ||||
|     # | ||||
|     def emit_buffer(position = @ts, type = :T_TEXT) | ||||
|       return unless @buffer_start_position | ||||
| 
 | ||||
|       content = text(@buffer_start_position, position) | ||||
| 
 | ||||
|       unless content.empty? | ||||
|         add_token(type, content) | ||||
| 
 | ||||
|         lines = content.count("\n") | ||||
| 
 | ||||
|         advance_line(lines) if lines > 0 | ||||
|       end | ||||
| 
 | ||||
|       @buffer_start_position = nil | ||||
|     end | ||||
| 
 | ||||
|     ## | ||||
|     # Returns the name of the element we're currently in. | ||||
|     # | ||||
|     # @return [String] | ||||
|     # | ||||
|     def current_element | ||||
|       return @elements.last | ||||
|     end | ||||
| 
 | ||||
|     %%{ | ||||
|       # Use instance variables for `ts` and friends. | ||||
|       access @; | ||||
|       getkey (@data[p] || 0); | ||||
| 
 | ||||
|       newline    = '\n' | '\r\n'; | ||||
|       whitespace = [ \t]; | ||||
| 
 | ||||
|       # Strings | ||||
|       # | ||||
|       # Strings in HTML can either be single or double quoted. If a string | ||||
|       # starts with one of these quotes it must be closed with the same type of | ||||
|       # quote. | ||||
|       dquote = '"'; | ||||
|       squote = "'"; | ||||
| 
 | ||||
|       action start_string_dquote { | ||||
|         start_buffer | ||||
| 
 | ||||
|         fcall string_dquote; | ||||
|       } | ||||
| 
 | ||||
|       action start_string_squote { | ||||
|         start_buffer | ||||
| 
 | ||||
|         fcall string_squote; | ||||
|       } | ||||
| 
 | ||||
|       # Machine for processing double quoted strings. | ||||
|       string_dquote := |* | ||||
|         dquote => { | ||||
|           emit_buffer(@ts, :T_STRING) | ||||
|           fret; | ||||
|         }; | ||||
| 
 | ||||
|         any; | ||||
|       *|; | ||||
| 
 | ||||
|       # Machine for processing single quoted strings. | ||||
|       string_squote := |* | ||||
|         squote => { | ||||
|           emit_buffer(@ts, :T_STRING) | ||||
|           fret; | ||||
|         }; | ||||
| 
 | ||||
|         any; | ||||
|       *|; | ||||
| 
 | ||||
|       # DOCTYPES | ||||
|       # | ||||
|       # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax | ||||
|       # | ||||
|       # These rules support the 3 flavours of doctypes: | ||||
|       # | ||||
|       # 1. Normal doctypes, as introduced in the HTML5 specification. | ||||
|       # 2. Deprecated doctypes, the more verbose ones used prior to HTML5. | ||||
|       # 3. Legacy doctypes | ||||
|       # | ||||
|       doctype_start = '<!DOCTYPE'i whitespace+ 'HTML'i; | ||||
| 
 | ||||
|       action start_doctype { | ||||
|         emit_buffer | ||||
|         add_token(:T_DOCTYPE_START) | ||||
|         fcall doctype; | ||||
|       } | ||||
| 
 | ||||
|       # Machine for processing doctypes. Doctype values such as the public and | ||||
|       # system IDs are treated as T_STRING tokens. | ||||
|       doctype := |* | ||||
|         'PUBLIC' | 'SYSTEM' => { t(:T_DOCTYPE_TYPE) }; | ||||
| 
 | ||||
|         # Lex the public/system IDs as regular strings. | ||||
|         dquote => start_string_dquote; | ||||
|         squote => start_string_squote; | ||||
| 
 | ||||
|         # Whitespace inside doctypes is ignored since there's no point in | ||||
|         # including it. | ||||
|         whitespace; | ||||
| 
 | ||||
|         '>' => { | ||||
|           add_token(:T_DOCTYPE_END) | ||||
|           fret; | ||||
|         }; | ||||
|       *|; | ||||
| 
 | ||||
|       # CDATA | ||||
|       # | ||||
|       # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections | ||||
|       # | ||||
|       # CDATA tags are broken up into 3 parts: the start, the content and the | ||||
|       # end tag. | ||||
|       # | ||||
|       # In HTML CDATA tags have no meaning/are not supported. Oga does | ||||
|       # support them but treats their contents as plain text. | ||||
|       # | ||||
|       cdata_start = '<![CDATA['; | ||||
|       cdata_end   = ']]>'; | ||||
| 
 | ||||
|       action start_cdata { | ||||
|         emit_buffer | ||||
|         add_token(:T_CDATA_START) | ||||
| 
 | ||||
|         start_buffer | ||||
| 
 | ||||
|         fcall cdata; | ||||
|       } | ||||
| 
 | ||||
|       # Machine that for processing the contents of CDATA tags. Everything | ||||
|       # inside a CDATA tag is treated as plain text. | ||||
|       cdata := |* | ||||
|         cdata_end => { | ||||
|           emit_buffer | ||||
|           add_token(:T_CDATA_END) | ||||
| 
 | ||||
|           fret; | ||||
|         }; | ||||
| 
 | ||||
|         any; | ||||
|       *|; | ||||
| 
 | ||||
|       # Comments | ||||
|       # | ||||
|       # http://www.w3.org/TR/html-markup/syntax.html#comments | ||||
|       # | ||||
|       # Comments are lexed into 3 parts: the start tag, the content and the end | ||||
|       # tag. | ||||
|       # | ||||
|       # Unlike the W3 specification these rules *do* allow character sequences | ||||
|       # such as `--` and `->`. Putting extra checks in for these sequences | ||||
|       # would actually make the rules/actions more complex. | ||||
|       # | ||||
|       comment_start = '<!--'; | ||||
|       comment_end   = '-->'; | ||||
| 
 | ||||
|       action start_comment { | ||||
|         emit_buffer | ||||
|         add_token(:T_COMMENT_START) | ||||
| 
 | ||||
|         start_buffer | ||||
| 
 | ||||
|         fcall comment; | ||||
|       } | ||||
| 
 | ||||
|       # Machine used for processing the contents of a comment. Everything | ||||
|       # inside a comment is treated as plain text (similar to CDATA tags). | ||||
|       comment := |* | ||||
|         comment_end => { | ||||
|           emit_buffer | ||||
|           add_token(:T_COMMENT_END) | ||||
| 
 | ||||
|           fret; | ||||
|         }; | ||||
| 
 | ||||
|         any; | ||||
|       *|; | ||||
| 
 | ||||
|       # XML declaration tags | ||||
|       # | ||||
|       # http://www.w3.org/TR/REC-xml/#sec-prolog-dtd | ||||
|       # | ||||
|       xml_decl_start = '<?xml'; | ||||
|       xml_decl_end   = '?>'; | ||||
| 
 | ||||
|       action start_xml_decl { | ||||
|         emit_buffer | ||||
|         add_token(:T_XML_DECL_START) | ||||
| 
 | ||||
|         start_buffer | ||||
| 
 | ||||
|         fcall xml_decl; | ||||
|       } | ||||
| 
 | ||||
|       # Machine that processes the contents of an XML declaration tag. | ||||
|       xml_decl := |* | ||||
|         xml_decl_end => { | ||||
|           emit_buffer | ||||
|           add_token(:T_XML_DECL_END) | ||||
| 
 | ||||
|           fret; | ||||
|         }; | ||||
| 
 | ||||
|         any; | ||||
|       *|; | ||||
| 
 | ||||
|       # Elements | ||||
|       # | ||||
|       # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements | ||||
|       # | ||||
| 
 | ||||
|       # Action that creates the tokens for the opening tag, name and namespace | ||||
|       # (if any). Remaining work is delegated to a dedicated machine. | ||||
|       action start_element { | ||||
|         emit_buffer | ||||
|         add_token(:T_ELEM_START) | ||||
| 
 | ||||
|         # Add the element name. If the name includes a namespace we'll break | ||||
|         # the name up into two separate tokens. | ||||
|         name = text(@ts + 1) | ||||
| 
 | ||||
|         if name.include?(':') | ||||
|           ns, name = name.split(':') | ||||
| 
 | ||||
|           add_token(:T_ELEM_NS, ns) | ||||
|         end | ||||
| 
 | ||||
|         @elements << name | ||||
| 
 | ||||
|         add_token(:T_ELEM_NAME, name) | ||||
| 
 | ||||
|         fcall element_head; | ||||
|       } | ||||
| 
 | ||||
|       element_name  = [a-zA-Z0-9\-_:]+; | ||||
|       element_start = '<' element_name; | ||||
| 
 | ||||
|       # Machine used for processing the characters inside a element head. An | ||||
|       # element head is everything between `<NAME` (where NAME is the element | ||||
|       # name) and `>`. | ||||
|       # | ||||
|       # For example, in `<p foo="bar">` the element head is ` foo="bar"`. | ||||
|       # | ||||
|       element_head := |* | ||||
|         whitespace | '='; | ||||
| 
 | ||||
|         newline => { advance_line }; | ||||
| 
 | ||||
|         # Attribute names. | ||||
|         element_name => { t(:T_ATTR) }; | ||||
| 
 | ||||
|         # Attribute values. | ||||
|         dquote => start_string_dquote; | ||||
|         squote => start_string_squote; | ||||
| 
 | ||||
|         # The closing character of the open tag. | ||||
|         ('>' | '/') => { | ||||
|           fhold; | ||||
|           fret; | ||||
|         }; | ||||
|       *|; | ||||
| 
 | ||||
|       main := |* | ||||
|         element_start  => start_element; | ||||
|         doctype_start  => start_doctype; | ||||
|         cdata_start    => start_cdata; | ||||
|         comment_start  => start_comment; | ||||
|         xml_decl_start => start_xml_decl; | ||||
| 
 | ||||
|         # Enter the body of the tag. If HTML mode is enabled and the current | ||||
|         # element is a void element we'll close it and bail out. | ||||
|         '>' => { | ||||
|           if html? and HTML_VOID_ELEMENTS.include?(current_element) | ||||
|             add_token(:T_ELEM_END, nil) | ||||
|             @elements.pop | ||||
|           end | ||||
|         }; | ||||
| 
 | ||||
|         # Regular closing tags. | ||||
|         '</' element_name '>' => { | ||||
|           emit_buffer | ||||
|           add_token(:T_ELEM_END, nil) | ||||
| 
 | ||||
|           @elements.pop | ||||
|         }; | ||||
| 
 | ||||
|         # Self closing elements that are not handled by the HTML mode. | ||||
|         '/>' => { | ||||
|           add_token(:T_ELEM_END, nil) | ||||
| 
 | ||||
|           @elements.pop | ||||
|         }; | ||||
| 
 | ||||
|         # Note that this rule should be declared at the very bottom as it will | ||||
|         # otherwise take precedence over the other rules. | ||||
|         any => { | ||||
|           # First character, start buffering (unless we already are buffering). | ||||
|           start_buffer(@ts) unless buffering? | ||||
| 
 | ||||
|           # EOF, emit the text buffer. | ||||
|           if @te == eof | ||||
|             emit_buffer(@te) | ||||
|           end | ||||
|         }; | ||||
|       *|; | ||||
|     }%% | ||||
|   end # Lexer | ||||
| end # Oga | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							|  | @ -0,0 +1,510 @@ | |||
| %%machine lexer; # % | ||||
| 
 | ||||
| module Oga | ||||
|   module XML | ||||
|     ## | ||||
|     # Low level lexer that supports both XML and HTML (using an extra option). To | ||||
|     # lex HTML input set the `:html` option to `true` when creating an instance | ||||
|     # of the lexer: | ||||
|     # | ||||
|     #     lexer = Oga::XML::Lexer.new(:html => true) | ||||
|     # | ||||
|     # @!attribute [r] html | ||||
|     #  @return [TrueClass|FalseClass] | ||||
|     # | ||||
|     class Lexer | ||||
|       %% write data; # % | ||||
| 
 | ||||
|       attr_reader :html | ||||
| 
 | ||||
|       ## | ||||
|       # Names of the HTML void elements that should be handled when HTML lexing | ||||
|       # is enabled. | ||||
|       # | ||||
|       # @return [Array] | ||||
|       # | ||||
|       HTML_VOID_ELEMENTS = [ | ||||
|         'area', | ||||
|         'base', | ||||
|         'br', | ||||
|         'col', | ||||
|         'command', | ||||
|         'embed', | ||||
|         'hr', | ||||
|         'img', | ||||
|         'input', | ||||
|         'keygen', | ||||
|         'link', | ||||
|         'meta', | ||||
|         'param', | ||||
|         'source', | ||||
|         'track', | ||||
|         'wbr' | ||||
|       ] | ||||
| 
 | ||||
|       # Lazy way of forwarding instance method calls used internally by Ragel to | ||||
|       # their corresponding class methods. | ||||
|       private_methods.grep(/^_lexer_/).each do |name| | ||||
|         define_method(name) do | ||||
|           return self.class.send(name) | ||||
|         end | ||||
| 
 | ||||
|         private(name) | ||||
|       end | ||||
| 
 | ||||
|       ## | ||||
|       # @param [Hash] options | ||||
|       # | ||||
|       # @option options [Symbol] :html When set to `true` the lexer will treat | ||||
|       #  the input as HTML instead of SGML/XML. This makes it possible to lex | ||||
|       #  HTML void elements such as `<link href="">`. | ||||
|       # | ||||
|       def initialize(options = {}) | ||||
|         options.each do |key, value| | ||||
|           instance_variable_set("@#{key}", value) if respond_to?(key) | ||||
|         end | ||||
| 
 | ||||
|         reset | ||||
|       end | ||||
| 
 | ||||
|       ## | ||||
|       # Resets the internal state of the lexer. Typically you don't need to call | ||||
|       # this method yourself as it's called by #lex after lexing a given String. | ||||
|       # | ||||
|       def reset | ||||
|         @line     = 1 | ||||
|         @data     = nil | ||||
|         @ts       = nil | ||||
|         @te       = nil | ||||
|         @tokens   = [] | ||||
|         @stack    = [] | ||||
|         @top      = 0 | ||||
|         @elements = [] | ||||
| 
 | ||||
|         @buffer_start_position = nil | ||||
|       end | ||||
| 
 | ||||
|       ## | ||||
|       # Lexes the supplied String and returns an Array of tokens. Each token is | ||||
|       # an Array in the following format: | ||||
|       # | ||||
|       #     [TYPE, VALUE] | ||||
|       # | ||||
|       # The type is a symbol, the value is either nil or a String. | ||||
|       # | ||||
|       # @param [String] data The string to lex. | ||||
|       # @return [Array] | ||||
|       # | ||||
|       def lex(data) | ||||
|         @data       = data.unpack('U*') | ||||
|         lexer_start = self.class.lexer_start | ||||
|         eof         = data.length | ||||
| 
 | ||||
|         %% write init; | ||||
|         %% write exec; | ||||
| 
 | ||||
|         tokens = @tokens | ||||
| 
 | ||||
|         reset | ||||
| 
 | ||||
|         return tokens | ||||
|       end | ||||
| 
 | ||||
|       ## | ||||
|       # @return [TrueClass|FalseClass] | ||||
|       # | ||||
|       def html? | ||||
|         return !!html | ||||
|       end | ||||
| 
 | ||||
|       private | ||||
| 
 | ||||
|       ## | ||||
|       # @param [Fixnum] amount The amount of lines to advance. | ||||
|       # | ||||
|       def advance_line(amount = 1) | ||||
|         @line += amount | ||||
|       end | ||||
| 
 | ||||
|       ## | ||||
|       # Emits a token whose value is based on the supplied start/stop position. | ||||
|       # | ||||
|       # @param [Symbol] type The token type. | ||||
|       # @param [Fixnum] start | ||||
|       # @param [Fixnum] stop | ||||
|       # | ||||
|       # @see #text | ||||
|       # @see #add_token | ||||
|       # | ||||
|       def t(type, start = @ts, stop = @te) | ||||
|         value = text(start, stop) | ||||
| 
 | ||||
|         add_token(type, value) | ||||
|       end | ||||
| 
 | ||||
|       ## | ||||
|       # Returns the text of the current buffer based on the supplied start and | ||||
|       # stop position. | ||||
|       # | ||||
|       # By default `@ts` and `@te` are used as the start/stop position. | ||||
|       # | ||||
|       # @param [Fixnum] start | ||||
|       # @param [Fixnum] stop | ||||
|       # @return [String] | ||||
|       # | ||||
|       def text(start = @ts, stop = @te) | ||||
|         return @data[start...stop].pack('U*') | ||||
|       end | ||||
| 
 | ||||
|       ## | ||||
|       # Adds a token with the given type and value to the list. | ||||
|       # | ||||
|       # @param [Symbol] type The token type. | ||||
|       # @param [String] value The token value. | ||||
|       # | ||||
|       def add_token(type, value = nil) | ||||
|         token = [type, value, @line] | ||||
| 
 | ||||
|         @tokens << token | ||||
|       end | ||||
| 
 | ||||
|       ## | ||||
|       # Enables buffering starting at the given position. | ||||
|       # | ||||
|       # @param [Fixnum] position The start position of the buffer, set to `@te` | ||||
|       #  by default. | ||||
|       # | ||||
|       def start_buffer(position = @te) | ||||
|         @buffer_start_position = position | ||||
|       end | ||||
| 
 | ||||
|       ## | ||||
|       # Returns `true` if we're currently buffering. | ||||
|       # | ||||
|       # @return [TrueClass|FalseClass] | ||||
|       # | ||||
|       def buffering? | ||||
|         return !!@buffer_start_position | ||||
|       end | ||||
| 
 | ||||
|       ## | ||||
|       # Emits the current buffer if we have any. The current line number is | ||||
|       # advanced based on the amount of newlines in the buffer. | ||||
|       # | ||||
|       # @param [Fixnum] position The end position of the buffer, set to `@ts` by | ||||
|       #  default. | ||||
|       # | ||||
|       # @param [Symbol] type The type of node to emit. | ||||
|       # | ||||
|       def emit_buffer(position = @ts, type = :T_TEXT) | ||||
|         return unless @buffer_start_position | ||||
| 
 | ||||
|         content = text(@buffer_start_position, position) | ||||
| 
 | ||||
|         unless content.empty? | ||||
|           add_token(type, content) | ||||
| 
 | ||||
|           lines = content.count("\n") | ||||
| 
 | ||||
|           advance_line(lines) if lines > 0 | ||||
|         end | ||||
| 
 | ||||
|         @buffer_start_position = nil | ||||
|       end | ||||
| 
 | ||||
|       ## | ||||
|       # Returns the name of the element we're currently in. | ||||
|       # | ||||
|       # @return [String] | ||||
|       # | ||||
|       def current_element | ||||
|         return @elements.last | ||||
|       end | ||||
| 
 | ||||
|       %%{ | ||||
|         # Use instance variables for `ts` and friends. | ||||
|         access @; | ||||
|         getkey (@data[p] || 0); | ||||
| 
 | ||||
|         newline    = '\n' | '\r\n'; | ||||
|         whitespace = [ \t]; | ||||
| 
 | ||||
|         # Strings | ||||
|         # | ||||
|         # Strings in HTML can either be single or double quoted. If a string | ||||
|         # starts with one of these quotes it must be closed with the same type of | ||||
|         # quote. | ||||
|         dquote = '"'; | ||||
|         squote = "'"; | ||||
| 
 | ||||
|         action start_string_dquote { | ||||
|           start_buffer | ||||
| 
 | ||||
|           fcall string_dquote; | ||||
|         } | ||||
| 
 | ||||
|         action start_string_squote { | ||||
|           start_buffer | ||||
| 
 | ||||
|           fcall string_squote; | ||||
|         } | ||||
| 
 | ||||
|         # Machine for processing double quoted strings. | ||||
|         string_dquote := |* | ||||
|           dquote => { | ||||
|             emit_buffer(@ts, :T_STRING) | ||||
|             fret; | ||||
|           }; | ||||
| 
 | ||||
|           any; | ||||
|         *|; | ||||
| 
 | ||||
|         # Machine for processing single quoted strings. | ||||
|         string_squote := |* | ||||
|           squote => { | ||||
|             emit_buffer(@ts, :T_STRING) | ||||
|             fret; | ||||
|           }; | ||||
| 
 | ||||
|           any; | ||||
|         *|; | ||||
| 
 | ||||
|         # DOCTYPES | ||||
|         # | ||||
|         # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax | ||||
|         # | ||||
|         # These rules support the 3 flavours of doctypes: | ||||
|         # | ||||
|         # 1. Normal doctypes, as introduced in the HTML5 specification. | ||||
|         # 2. Deprecated doctypes, the more verbose ones used prior to HTML5. | ||||
|         # 3. Legacy doctypes | ||||
|         # | ||||
|         doctype_start = '<!DOCTYPE'i whitespace+ 'HTML'i; | ||||
| 
 | ||||
|         action start_doctype { | ||||
|           emit_buffer | ||||
|           add_token(:T_DOCTYPE_START) | ||||
|           fcall doctype; | ||||
|         } | ||||
| 
 | ||||
|         # Machine for processing doctypes. Doctype values such as the public and | ||||
|         # system IDs are treated as T_STRING tokens. | ||||
|         doctype := |* | ||||
|           'PUBLIC' | 'SYSTEM' => { t(:T_DOCTYPE_TYPE) }; | ||||
| 
 | ||||
|           # Lex the public/system IDs as regular strings. | ||||
|           dquote => start_string_dquote; | ||||
|           squote => start_string_squote; | ||||
| 
 | ||||
|           # Whitespace inside doctypes is ignored since there's no point in | ||||
|           # including it. | ||||
|           whitespace; | ||||
| 
 | ||||
|           '>' => { | ||||
|             add_token(:T_DOCTYPE_END) | ||||
|             fret; | ||||
|           }; | ||||
|         *|; | ||||
| 
 | ||||
|         # CDATA | ||||
|         # | ||||
|         # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections | ||||
|         # | ||||
|         # CDATA tags are broken up into 3 parts: the start, the content and the | ||||
|         # end tag. | ||||
|         # | ||||
|         # In HTML CDATA tags have no meaning/are not supported. Oga does | ||||
|         # support them but treats their contents as plain text. | ||||
|         # | ||||
|         cdata_start = '<![CDATA['; | ||||
|         cdata_end   = ']]>'; | ||||
| 
 | ||||
|         action start_cdata { | ||||
|           emit_buffer | ||||
|           add_token(:T_CDATA_START) | ||||
| 
 | ||||
|           start_buffer | ||||
| 
 | ||||
|           fcall cdata; | ||||
|         } | ||||
| 
 | ||||
|         # Machine for processing the contents of CDATA tags. Everything | ||||
|         # inside a CDATA tag is treated as plain text. | ||||
|         cdata := |* | ||||
|           cdata_end => { | ||||
|             emit_buffer | ||||
|             add_token(:T_CDATA_END) | ||||
| 
 | ||||
|             fret; | ||||
|           }; | ||||
| 
 | ||||
|           any; | ||||
|         *|; | ||||
| 
 | ||||
|         # Comments | ||||
|         # | ||||
|         # http://www.w3.org/TR/html-markup/syntax.html#comments | ||||
|         # | ||||
|         # Comments are lexed into 3 parts: the start tag, the content and the end | ||||
|         # tag. | ||||
|         # | ||||
|         # Unlike the W3 specification these rules *do* allow character sequences | ||||
|         # such as `--` and `->`. Putting extra checks in for these sequences | ||||
|         # would actually make the rules/actions more complex. | ||||
|         # | ||||
|         comment_start = '<!--'; | ||||
|         comment_end   = '-->'; | ||||
| 
 | ||||
|         action start_comment { | ||||
|           emit_buffer | ||||
|           add_token(:T_COMMENT_START) | ||||
| 
 | ||||
|           start_buffer | ||||
| 
 | ||||
|           fcall comment; | ||||
|         } | ||||
| 
 | ||||
|         # Machine used for processing the contents of a comment. Everything | ||||
|         # inside a comment is treated as plain text (similar to CDATA tags). | ||||
|         comment := |* | ||||
|           comment_end => { | ||||
|             emit_buffer | ||||
|             add_token(:T_COMMENT_END) | ||||
| 
 | ||||
|             fret; | ||||
|           }; | ||||
| 
 | ||||
|           any; | ||||
|         *|; | ||||
| 
 | ||||
|         # XML declaration tags | ||||
|         # | ||||
|         # http://www.w3.org/TR/REC-xml/#sec-prolog-dtd | ||||
|         # | ||||
|         xml_decl_start = '<?xml'; | ||||
|         xml_decl_end   = '?>'; | ||||
| 
 | ||||
|         action start_xml_decl { | ||||
|           emit_buffer | ||||
|           add_token(:T_XML_DECL_START) | ||||
| 
 | ||||
|           start_buffer | ||||
| 
 | ||||
|           fcall xml_decl; | ||||
|         } | ||||
| 
 | ||||
|         # Machine that processes the contents of an XML declaration tag. | ||||
|         xml_decl := |* | ||||
|           xml_decl_end => { | ||||
|             emit_buffer | ||||
|             add_token(:T_XML_DECL_END) | ||||
| 
 | ||||
|             fret; | ||||
|           }; | ||||
| 
 | ||||
|           any; | ||||
|         *|; | ||||
| 
 | ||||
|         # Elements | ||||
|         # | ||||
|         # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements | ||||
|         # | ||||
| 
 | ||||
|         # Action that creates the tokens for the opening tag, name and namespace | ||||
|         # (if any). Remaining work is delegated to a dedicated machine. | ||||
|         action start_element { | ||||
|           emit_buffer | ||||
|           add_token(:T_ELEM_START) | ||||
| 
 | ||||
|           # Add the element name. If the name includes a namespace we'll break | ||||
|           # the name up into two separate tokens. | ||||
|           name = text(@ts + 1) | ||||
| 
 | ||||
|           if name.include?(':') | ||||
|             ns, name = name.split(':') | ||||
| 
 | ||||
|             add_token(:T_ELEM_NS, ns) | ||||
|           end | ||||
| 
 | ||||
|           @elements << name | ||||
| 
 | ||||
|           add_token(:T_ELEM_NAME, name) | ||||
| 
 | ||||
|           fcall element_head; | ||||
|         } | ||||
| 
 | ||||
|         element_name  = [a-zA-Z0-9\-_:]+; | ||||
|         element_start = '<' element_name; | ||||
| 
 | ||||
|         # Machine used for processing the characters inside a element head. An | ||||
|         # element head is everything between `<NAME` (where NAME is the element | ||||
|         # name) and `>`. | ||||
|         # | ||||
|         # For example, in `<p foo="bar">` the element head is ` foo="bar"`. | ||||
|         # | ||||
|         element_head := |* | ||||
|           whitespace | '='; | ||||
| 
 | ||||
|           newline => { advance_line }; | ||||
| 
 | ||||
|           # Attribute names. | ||||
|           element_name => { t(:T_ATTR) }; | ||||
| 
 | ||||
|           # Attribute values. | ||||
|           dquote => start_string_dquote; | ||||
|           squote => start_string_squote; | ||||
| 
 | ||||
|           # The closing character of the open tag. | ||||
|           ('>' | '/') => { | ||||
|             fhold; | ||||
|             fret; | ||||
|           }; | ||||
|         *|; | ||||
| 
 | ||||
|         main := |* | ||||
|           element_start  => start_element; | ||||
|           doctype_start  => start_doctype; | ||||
|           cdata_start    => start_cdata; | ||||
|           comment_start  => start_comment; | ||||
|           xml_decl_start => start_xml_decl; | ||||
| 
 | ||||
|           # Enter the body of the tag. If HTML mode is enabled and the current | ||||
|           # element is a void element we'll close it and bail out. | ||||
|           '>' => { | ||||
|             if html? and HTML_VOID_ELEMENTS.include?(current_element) | ||||
|               add_token(:T_ELEM_END, nil) | ||||
|               @elements.pop | ||||
|             end | ||||
|           }; | ||||
| 
 | ||||
|           # Regular closing tags. | ||||
|           '</' element_name '>' => { | ||||
|             emit_buffer | ||||
|             add_token(:T_ELEM_END, nil) | ||||
| 
 | ||||
|             @elements.pop | ||||
|           }; | ||||
| 
 | ||||
|           # Self closing elements that are not handled by the HTML mode. | ||||
|           '/>' => { | ||||
|             add_token(:T_ELEM_END, nil) | ||||
| 
 | ||||
|             @elements.pop | ||||
|           }; | ||||
| 
 | ||||
|           # Note that this rule should be declared at the very bottom as it will | ||||
|           # otherwise take precedence over the other rules. | ||||
|           any => { | ||||
|             # First character, start buffering (unless we already are buffering). | ||||
|             start_buffer(@ts) unless buffering? | ||||
| 
 | ||||
|             # EOF, emit the text buffer. | ||||
|             if @te == eof | ||||
|               emit_buffer(@te) | ||||
|             end | ||||
|           }; | ||||
|         *|; | ||||
|       }%% | ||||
|     end # Lexer | ||||
|   end # XML | ||||
| end # Oga | ||||
|  | @ -0,0 +1,402 @@ | |||
| # | ||||
| # DO NOT MODIFY!!!! | ||||
| # This file is automatically generated by Racc 1.4.11 | ||||
| # from Racc grammer file "". | ||||
| # | ||||
| 
 | ||||
| require 'racc/parser.rb' | ||||
| module Oga | ||||
|   module XML | ||||
|     class Parser < Racc::Parser | ||||
| 
 | ||||
|   ## | ||||
|   # @param [Hash] options | ||||
|   # | ||||
|   # @option options [TrueClass|FalseClass] :html Enables HTML parsing mode. | ||||
|   # @see Oga::Lexer#initialize | ||||
|   # | ||||
|   def initialize(options = {}) | ||||
|     @lexer = Lexer.new(options) | ||||
|   end | ||||
| 
 | ||||
|   ## | ||||
|   # Resets the internal state of the parser. | ||||
|   # | ||||
|   def reset | ||||
|     @lines = [] | ||||
|     @line  = 1 | ||||
|   end | ||||
| 
 | ||||
|   ## | ||||
|   # Emits a new AST token. | ||||
|   # | ||||
|   # @param [Symbol] type | ||||
|   # @param [Array] children | ||||
|   # | ||||
|   def s(type, *children) | ||||
|     return AST::Node.new( | ||||
|       type, | ||||
|       children.flatten, | ||||
|       :line => @line | ||||
|     ) | ||||
|   end | ||||
| 
 | ||||
|   ## | ||||
|   # Returns the next token from the lexer. | ||||
|   # | ||||
|   # @return [Array] | ||||
|   # | ||||
|   def next_token | ||||
|     type, value, line = @tokens.shift | ||||
| 
 | ||||
|     @line = line if line | ||||
| 
 | ||||
|     return type ? [type, value] : [false, false] | ||||
|   end | ||||
| 
 | ||||
|   ## | ||||
|   # @param [Fixnum] type The type of token the error occured on. | ||||
|   # @param [String] value The value of the token. | ||||
|   # @param [Array] stack The current stack of parsed nodes. | ||||
|   # @raise [Racc::ParseError] | ||||
|   # | ||||
|   def on_error(type, value, stack) | ||||
|     name  = token_to_str(type) | ||||
|     index = @line - 1 | ||||
|     lines = '' | ||||
| 
 | ||||
|     # Show up to 5 lines before and after the offending line (if they exist). | ||||
|     (-5..5).each do |offset| | ||||
|       line   = @lines[index + offset] | ||||
|       number = @line + offset | ||||
| 
 | ||||
|       if line and number > 0 | ||||
|         if offset == 0 | ||||
|           prefix = '=> ' | ||||
|         else | ||||
|           prefix = '   ' | ||||
|         end | ||||
| 
 | ||||
|         lines << "#{prefix}#{number}: #{line.strip}\n" | ||||
|       end | ||||
|     end | ||||
| 
 | ||||
|     raise Racc::ParseError, <<-EOF | ||||
| Unexpected #{name} with value #{value.inspect} on line #{@line}: | ||||
| 
 | ||||
| #{lines} | ||||
|     EOF | ||||
|   end | ||||
| 
 | ||||
|   ## | ||||
|   # Parses the supplied string and returns the AST. | ||||
|   # | ||||
|   # @example | ||||
|   #  parser = Oga::Parser.new | ||||
|   #  ast    = parser.parse('<foo>bar</foo>') | ||||
|   # | ||||
|   # @param [String] string | ||||
|   # @return [Oga::AST::Node] | ||||
|   # | ||||
|   def parse(string) | ||||
|     @lines  = string.lines | ||||
|     @tokens = @lexer.lex(string) | ||||
|     ast     = do_parse | ||||
| 
 | ||||
|     reset | ||||
| 
 | ||||
|     return ast | ||||
|   end | ||||
| 
 | ||||
| # vim: set ft=racc: | ||||
| ##### State transition tables begin ### | ||||
| 
 | ||||
| racc_action_table = [ | ||||
|     16,    40,    16,    10,    24,    37,    11,    22,    12,    28, | ||||
|     14,    23,    21,    45,    31,    15,    16,    10,    44,    28, | ||||
|     11,    43,    12,    36,    14,    35,    16,    10,    34,    15, | ||||
|     11,    41,    12,    42,    14,    33,    16,    10,    17,    15, | ||||
|     11,    46,    12,   nil,    14,    29,    30,    19,    20,    15 ] | ||||
| 
 | ||||
| racc_action_check = [ | ||||
|     15,    28,    38,    38,    12,    24,    38,    11,    38,    13, | ||||
|     38,    12,    11,    38,    15,    38,     2,     2,    35,    26, | ||||
|      2,    35,     2,    22,     2,    20,    25,    25,    20,     2, | ||||
|     25,    30,    25,    32,    25,    17,     0,     0,     1,    25, | ||||
|      0,    44,     0,   nil,     0,    14,    14,    10,    10,     0 ] | ||||
| 
 | ||||
| racc_action_pointer = [ | ||||
|     33,    38,    13,   nil,   nil,   nil,   nil,   nil,   nil,   nil, | ||||
|     42,     4,     1,    -6,    33,    -3,   nil,    35,   nil,   nil, | ||||
|     23,   nil,    15,   nil,    -5,    23,     4,   nil,    -1,   nil, | ||||
|     19,   nil,    16,   nil,   nil,    16,   nil,   nil,    -1,   nil, | ||||
|    nil,   nil,   nil,   nil,    36,   nil,   nil ] | ||||
| 
 | ||||
| racc_action_default = [ | ||||
|     -2,   -32,    -1,    -4,    -6,    -7,    -8,    -9,   -10,   -11, | ||||
|    -32,   -32,   -32,   -24,   -32,   -32,   -31,   -32,    -3,   -12, | ||||
|    -32,   -16,   -32,   -18,   -32,    -5,   -23,   -26,   -27,   -21, | ||||
|    -32,   -29,   -32,    47,   -13,   -32,   -17,   -19,   -32,   -25, | ||||
|    -28,   -22,   -30,   -14,   -32,   -20,   -15 ] | ||||
| 
 | ||||
| racc_goto_table = [ | ||||
|     18,     2,    27,    32,    25,    26,     1,   nil,   nil,   nil, | ||||
|    nil,   nil,   nil,   nil,   nil,    39,   nil,   nil,   nil,   nil, | ||||
|    nil,   nil,   nil,   nil,   nil,   nil,    38,   nil,   nil,   nil, | ||||
|    nil,   nil,   nil,   nil,   nil,   nil,    18 ] | ||||
| 
 | ||||
| racc_goto_check = [ | ||||
|      3,     2,    13,     8,    11,    12,     1,   nil,   nil,   nil, | ||||
|    nil,   nil,   nil,   nil,   nil,    13,   nil,   nil,   nil,   nil, | ||||
|    nil,   nil,   nil,   nil,   nil,   nil,     2,   nil,   nil,   nil, | ||||
|    nil,   nil,   nil,   nil,   nil,   nil,     3 ] | ||||
| 
 | ||||
| racc_goto_pointer = [ | ||||
|    nil,     6,     1,    -2,   nil,   nil,   nil,   nil,   -12,   nil, | ||||
|    nil,    -9,    -8,   -11 ] | ||||
| 
 | ||||
| racc_goto_default = [ | ||||
|    nil,   nil,   nil,     3,     4,     5,     6,     7,     8,     9, | ||||
|     13,   nil,   nil,   nil ] | ||||
| 
 | ||||
| racc_reduce_table = [ | ||||
|   0, 0, :racc_error, | ||||
|   1, 19, :_reduce_1, | ||||
|   0, 19, :_reduce_2, | ||||
|   2, 20, :_reduce_3, | ||||
|   1, 20, :_reduce_4, | ||||
|   0, 20, :_reduce_5, | ||||
|   1, 21, :_reduce_none, | ||||
|   1, 21, :_reduce_none, | ||||
|   1, 21, :_reduce_none, | ||||
|   1, 21, :_reduce_none, | ||||
|   1, 21, :_reduce_none, | ||||
|   1, 21, :_reduce_none, | ||||
|   2, 22, :_reduce_12, | ||||
|   3, 22, :_reduce_13, | ||||
|   4, 22, :_reduce_14, | ||||
|   5, 22, :_reduce_15, | ||||
|   2, 23, :_reduce_16, | ||||
|   3, 23, :_reduce_17, | ||||
|   2, 24, :_reduce_18, | ||||
|   3, 24, :_reduce_19, | ||||
|   4, 25, :_reduce_20, | ||||
|   2, 28, :_reduce_21, | ||||
|   3, 28, :_reduce_22, | ||||
|   1, 29, :_reduce_23, | ||||
|   0, 29, :_reduce_24, | ||||
|   2, 30, :_reduce_25, | ||||
|   1, 30, :_reduce_26, | ||||
|   1, 31, :_reduce_27, | ||||
|   2, 31, :_reduce_28, | ||||
|   2, 27, :_reduce_29, | ||||
|   3, 27, :_reduce_30, | ||||
|   1, 26, :_reduce_31 ] | ||||
| 
 | ||||
| racc_reduce_n = 32 | ||||
| 
 | ||||
| racc_shift_n = 47 | ||||
| 
 | ||||
| racc_token_table = { | ||||
|   false => 0, | ||||
|   :error => 1, | ||||
|   :T_STRING => 2, | ||||
|   :T_TEXT => 3, | ||||
|   :T_DOCTYPE_START => 4, | ||||
|   :T_DOCTYPE_END => 5, | ||||
|   :T_DOCTYPE_TYPE => 6, | ||||
|   :T_CDATA_START => 7, | ||||
|   :T_CDATA_END => 8, | ||||
|   :T_COMMENT_START => 9, | ||||
|   :T_COMMENT_END => 10, | ||||
|   :T_ELEM_START => 11, | ||||
|   :T_ELEM_NAME => 12, | ||||
|   :T_ELEM_NS => 13, | ||||
|   :T_ELEM_END => 14, | ||||
|   :T_ATTR => 15, | ||||
|   :T_XML_DECL_START => 16, | ||||
|   :T_XML_DECL_END => 17 } | ||||
| 
 | ||||
| racc_nt_base = 18 | ||||
| 
 | ||||
| racc_use_result_var = false | ||||
| 
 | ||||
| Racc_arg = [ | ||||
|   racc_action_table, | ||||
|   racc_action_check, | ||||
|   racc_action_default, | ||||
|   racc_action_pointer, | ||||
|   racc_goto_table, | ||||
|   racc_goto_check, | ||||
|   racc_goto_default, | ||||
|   racc_goto_pointer, | ||||
|   racc_nt_base, | ||||
|   racc_reduce_table, | ||||
|   racc_token_table, | ||||
|   racc_shift_n, | ||||
|   racc_reduce_n, | ||||
|   racc_use_result_var ] | ||||
| 
 | ||||
| Racc_token_to_s_table = [ | ||||
|   "$end", | ||||
|   "error", | ||||
|   "T_STRING", | ||||
|   "T_TEXT", | ||||
|   "T_DOCTYPE_START", | ||||
|   "T_DOCTYPE_END", | ||||
|   "T_DOCTYPE_TYPE", | ||||
|   "T_CDATA_START", | ||||
|   "T_CDATA_END", | ||||
|   "T_COMMENT_START", | ||||
|   "T_COMMENT_END", | ||||
|   "T_ELEM_START", | ||||
|   "T_ELEM_NAME", | ||||
|   "T_ELEM_NS", | ||||
|   "T_ELEM_END", | ||||
|   "T_ATTR", | ||||
|   "T_XML_DECL_START", | ||||
|   "T_XML_DECL_END", | ||||
|   "$start", | ||||
|   "document", | ||||
|   "expressions", | ||||
|   "expression", | ||||
|   "doctype", | ||||
|   "cdata", | ||||
|   "comment", | ||||
|   "element", | ||||
|   "text", | ||||
|   "xmldecl", | ||||
|   "element_open", | ||||
|   "attributes", | ||||
|   "attributes_", | ||||
|   "attribute" ] | ||||
| 
 | ||||
| Racc_debug_parser = false | ||||
| 
 | ||||
| ##### State transition tables end ##### | ||||
| 
 | ||||
| # reduce 0 omitted | ||||
| 
 | ||||
| def _reduce_1(val, _values) | ||||
|  s(:document, val[0])  | ||||
| end | ||||
| 
 | ||||
| def _reduce_2(val, _values) | ||||
|  s(:document)  | ||||
| end | ||||
| 
 | ||||
| def _reduce_3(val, _values) | ||||
|  val.compact  | ||||
| end | ||||
| 
 | ||||
| def _reduce_4(val, _values) | ||||
|  val[0]  | ||||
| end | ||||
| 
 | ||||
| def _reduce_5(val, _values) | ||||
|  nil  | ||||
| end | ||||
| 
 | ||||
| # reduce 6 omitted | ||||
| 
 | ||||
| # reduce 7 omitted | ||||
| 
 | ||||
| # reduce 8 omitted | ||||
| 
 | ||||
| # reduce 9 omitted | ||||
| 
 | ||||
| # reduce 10 omitted | ||||
| 
 | ||||
| # reduce 11 omitted | ||||
| 
 | ||||
| def _reduce_12(val, _values) | ||||
|  s(:doctype)  | ||||
| end | ||||
| 
 | ||||
| def _reduce_13(val, _values) | ||||
|         s(:doctype, val[1]) | ||||
|        | ||||
| end | ||||
| 
 | ||||
| def _reduce_14(val, _values) | ||||
|         s(:doctype, val[1], val[2]) | ||||
|        | ||||
| end | ||||
| 
 | ||||
| def _reduce_15(val, _values) | ||||
|         s(:doctype, val[1], val[2], val[3]) | ||||
|        | ||||
| end | ||||
| 
 | ||||
| def _reduce_16(val, _values) | ||||
|  s(:cdata)  | ||||
| end | ||||
| 
 | ||||
| def _reduce_17(val, _values) | ||||
|  s(:cdata, val[1])  | ||||
| end | ||||
| 
 | ||||
| def _reduce_18(val, _values) | ||||
|  s(:comment)  | ||||
| end | ||||
| 
 | ||||
| def _reduce_19(val, _values) | ||||
|  s(:comment, val[1])  | ||||
| end | ||||
| 
 | ||||
| def _reduce_20(val, _values) | ||||
|         s(:element, val[0], val[1], val[2]) | ||||
|        | ||||
| end | ||||
| 
 | ||||
| def _reduce_21(val, _values) | ||||
|  [nil, val[1]]  | ||||
| end | ||||
| 
 | ||||
| def _reduce_22(val, _values) | ||||
|  [val[1], val[2]]  | ||||
| end | ||||
| 
 | ||||
| def _reduce_23(val, _values) | ||||
|  s(:attributes, val[0])  | ||||
| end | ||||
| 
 | ||||
| def _reduce_24(val, _values) | ||||
|  nil  | ||||
| end | ||||
| 
 | ||||
| def _reduce_25(val, _values) | ||||
|  val  | ||||
| end | ||||
| 
 | ||||
| def _reduce_26(val, _values) | ||||
|  val  | ||||
| end | ||||
| 
 | ||||
| def _reduce_27(val, _values) | ||||
|  s(:attribute, val[0])  | ||||
| end | ||||
| 
 | ||||
| def _reduce_28(val, _values) | ||||
|  s(:attribute, val[0], val[1])  | ||||
| end | ||||
| 
 | ||||
| def _reduce_29(val, _values) | ||||
|  s(:xml_decl)  | ||||
| end | ||||
| 
 | ||||
| def _reduce_30(val, _values) | ||||
|  s(:xml_decl, val[1])  | ||||
| end | ||||
| 
 | ||||
| def _reduce_31(val, _values) | ||||
|  s(:text, val[0])  | ||||
| end | ||||
| 
 | ||||
| def _reduce_none(val, _values) | ||||
|   val[0] | ||||
| end | ||||
| 
 | ||||
|     end   # class Parser | ||||
|     end   # module XML | ||||
|   end   # module Oga | ||||
|  | @ -5,9 +5,9 @@ | |||
| # It requires every tag to have a closing tag. As such you'll need to enable | ||||
| # HTML parsing mode when parsing HTML. This can be done as following: | ||||
| # | ||||
| #     parser = Oga::Parser.new(:html => true) | ||||
| #     parser = Oga::XML::Parser.new(:html => true) | ||||
| # | ||||
| class Oga::Parser | ||||
| class Oga::XML::Parser | ||||
| 
 | ||||
| token T_STRING T_TEXT | ||||
| token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE | ||||
|  | @ -1,6 +1,6 @@ | |||
| require 'spec_helper' | ||||
| 
 | ||||
| describe Oga::Lexer do | ||||
| describe Oga::XML::Lexer do | ||||
|   context 'cdata tags' do | ||||
|     example 'lex a cdata tag' do | ||||
|       lex('<![CDATA[foo]]>').should == [ | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| require 'spec_helper' | ||||
| 
 | ||||
| describe Oga::Lexer do | ||||
| describe Oga::XML::Lexer do | ||||
|   context 'comments' do | ||||
|     example 'lex a comment' do | ||||
|       lex('<!-- foo -->').should == [ | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| require 'spec_helper' | ||||
| 
 | ||||
| describe Oga::Lexer do | ||||
| describe Oga::XML::Lexer do | ||||
|   context 'doctypes' do | ||||
|     example 'lex the HTML5 doctype' do | ||||
|       lex('<!DOCTYPE html>').should == [ | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| require 'spec_helper' | ||||
| 
 | ||||
| describe Oga::Lexer do | ||||
| describe Oga::XML::Lexer do | ||||
|   context 'HTML documents' do | ||||
|     example 'lex a basic HTML document' do | ||||
|       html = <<-EOF | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| require 'spec_helper' | ||||
| 
 | ||||
| describe Oga::Lexer do | ||||
| describe Oga::XML::Lexer do | ||||
|   context 'elements' do | ||||
|     example 'lex an opening element' do | ||||
|       lex('<p>').should == [ | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| require 'spec_helper' | ||||
| 
 | ||||
| describe Oga::Lexer do | ||||
| describe Oga::XML::Lexer do | ||||
|   context 'regular text' do | ||||
|     example 'lex regular text' do | ||||
|       lex('hello').should == [[:T_TEXT, 'hello', 1]] | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| require 'spec_helper' | ||||
| 
 | ||||
| describe Oga::Lexer do | ||||
| describe Oga::XML::Lexer do | ||||
|   context 'HTML void elements' do | ||||
|     example 'lex a void element that omits the closing /' do | ||||
|       lex('<link>', :html => true).should == [ | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| require 'spec_helper' | ||||
| 
 | ||||
| describe Oga::Lexer do | ||||
| describe Oga::XML::Lexer do | ||||
|   context 'XML declaration tags' do | ||||
|     example 'lex a start tag' do | ||||
|       lex('<?xml').should == [[:T_XML_DECL_START, nil, 1]] | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| require 'spec_helper' | ||||
| 
 | ||||
| describe Oga::Parser do | ||||
| describe Oga::XML::Parser do | ||||
|   context 'cdata tags' do | ||||
|     example 'parse a cdata tag' do | ||||
|       parse('<![CDATA[foo]]>').should == s(:document, s(:cdata, 'foo')) | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| require 'spec_helper' | ||||
| 
 | ||||
| describe Oga::Parser do | ||||
| describe Oga::XML::Parser do | ||||
|   context 'comments' do | ||||
|     example 'parse an empty comment' do | ||||
|       parse('<!---->').should == s(:document, s(:comment)) | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| require 'spec_helper' | ||||
| 
 | ||||
| describe Oga::Parser do | ||||
| describe Oga::XML::Parser do | ||||
|   context 'doctypes' do | ||||
|     example 'parse a doctype' do | ||||
|       parse('<!DOCTYPE html>').should == s(:document, s(:doctype)) | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| require 'spec_helper' | ||||
| 
 | ||||
| describe Oga::Parser do | ||||
| describe Oga::XML::Parser do | ||||
|   context 'HTML documents' do | ||||
|     example 'parse a basic HTML document' do | ||||
|       html = <<-EOF | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| require 'spec_helper' | ||||
| 
 | ||||
| describe Oga::Parser do | ||||
| describe Oga::XML::Parser do | ||||
|   context 'elements' do | ||||
|     example 'parse an empty element' do | ||||
|       parse('<p></p>').should == s( | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| require 'spec_helper' | ||||
| 
 | ||||
| describe Oga::Parser do | ||||
| describe Oga::XML::Parser do | ||||
|   example 'parse regular text' do | ||||
|     parse('foo').should == s(:document, s(:text, 'foo')) | ||||
|   end | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| require 'spec_helper' | ||||
| 
 | ||||
| describe Oga::Parser do | ||||
| describe Oga::XML::Parser do | ||||
|   context 'HTML void elements' do | ||||
|     example 'parse a void element that omits the closing /' do | ||||
|       parse('<link>', :html => true).should == s( | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| require 'spec_helper' | ||||
| 
 | ||||
| describe Oga::Parser do | ||||
| describe Oga::XML::Parser do | ||||
|   context 'XML declaration tags' do | ||||
|     example 'lex an XML declaration tag' do | ||||
|       parse('<?xml hello ?>').should == s( | ||||
|  |  | |||
|  | @ -19,7 +19,7 @@ module Oga | |||
|     # @return [Array] | ||||
|     # | ||||
|     def lex(input, options = {}) | ||||
|       return Oga::Lexer.new(options).lex(input) | ||||
|       return Oga::XML::Lexer.new(options).lex(input) | ||||
|     end | ||||
| 
 | ||||
|     ## | ||||
|  | @ -30,7 +30,7 @@ module Oga | |||
|     # @return [Oga::AST::Node] | ||||
|     # | ||||
|     def parse(input, options = {}) | ||||
|       return Oga::Parser.new(options).parse(input) | ||||
|       return Oga::XML::Parser.new(options).parse(input) | ||||
|     end | ||||
|   end # ParsingHelpers | ||||
| end # Oga | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue