Namespaced the lexer/parser under Oga::XML.

With the upcoming XPath and CSS selector lexers/parsers it will be confusing to keep these in the root namespace.
2014-03-25 09:34:38 +01:00 · 2014-03-25 09:34:38 +01:00 · eae13d21ed
parent 2259061c89
commit eae13d21ed
28 changed files with 2049 additions and 537 deletions
--- a/6
+++ b/6
@ -5,10 +5,10 @@ require 'cliver'

 GEMSPEC = Gem::Specification.load('oga.gemspec')

-LEXER_INPUT  = 'lib/oga/lexer.rl'
-LEXER_OUTPUT = 'lib/oga/lexer.rb'
+LEXER_INPUT  = 'lib/oga/xml/lexer.rl'
+LEXER_OUTPUT = 'lib/oga/xml/lexer.rb'

-HTML_PARSER = 'lib/oga/parser.rb'
+HTML_PARSER = 'lib/oga/xml/parser.rb'

 GENERATED_FILES = ['coverage', 'yardoc', LEXER_OUTPUT, HTML_PARSER]

--- a/benchmark/lexer/bench_cdata.rb
+++ b/benchmark/lexer/bench_cdata.rb
@ -5,7 +5,7 @@ string = 'Hello, how are you doing today?'
 small  = "<![CDATA[#{string}]]>"
 medium = "<![CDATA[#{string * 1_000}]]>"
 large  = "<![CDATA[#{string * 10_000}]]>"
-lexer  = Oga::Lexer.new
+lexer  = Oga::XML::Lexer.new

 Benchmark.ips do |bench|
  bench.report 'CDATA with a small body' do
--- a/benchmark/lexer/bench_element.rb
+++ b/benchmark/lexer/bench_element.rb
@ -4,7 +4,7 @@ require 'benchmark/ips'
 simple     = '<p>Hello world</p>'
 attributes = '<p class="foo">Hello world</p>'
 nested     = '<p>Hello<strong>world</strong></p>'
-lexer      = Oga::Lexer.new
+lexer      = Oga::XML::Lexer.new

 Benchmark.ips do |bench|
  bench.report 'text only' do
--- a/benchmark/lexer/bench_html.rb
+++ b/benchmark/lexer/bench_html.rb
@ -2,7 +2,7 @@ require_relative '../../lib/oga'
 require 'benchmark/ips'

 html  = File.read(File.expand_path('../../fixtures/hrs.html', __FILE__))
-lexer = Oga::Lexer.new(:html => true)
+lexer = Oga::XML::Lexer.new(:html => true)

 Benchmark.ips do |bench|
  bench.report 'lex HTML' do
--- a/benchmark/lexer/bench_html_time.rb
+++ b/benchmark/lexer/bench_html_time.rb
@ -2,7 +2,7 @@ require_relative '../../lib/oga'
 require 'benchmark'

 html  = File.read(File.expand_path('../../fixtures/hrs.html', __FILE__))
-lexer = Oga::Lexer.new(:html => true)
+lexer = Oga::XML::Lexer.new(:html => true)

 Benchmark.bmbm(20) do |bench|
  bench.report 'lex HTML' do
--- a/lib/oga.rb
+++ b/lib/oga.rb
@ -1,5 +1,5 @@
 require 'ast'

 require_relative 'oga/ast/node'
-require_relative 'oga/lexer'
-require_relative 'oga/parser'
+require_relative 'oga/xml/lexer'
+require_relative 'oga/xml/parser'
--- a/lib/oga/lexer.rl
+++ b/lib/oga/lexer.rl
@ -1,508 +0,0 @@
-%%machine lexer; # %
-
-module Oga
-  ##
-  # Low level lexer that supports both XML and HTML (using an extra option). To
-  # lex HTML input set the `:html` option to `true` when creating an instance
-  # of the lexer:
-  #
-  #     lexer = Oga::Lexer.new(:html => true)
-  #
-  # @!attribute [r] html
-  #  @return [TrueClass|FalseClass]
-  #
-  class Lexer
-    %% write data; # %
-
-    attr_reader :html
-
-    ##
-    # Names of the HTML void elements that should be handled when HTML lexing
-    # is enabled.
-    #
-    # @return [Array]
-    #
-    HTML_VOID_ELEMENTS = [
-      'area',
-      'base',
-      'br',
-      'col',
-      'command',
-      'embed',
-      'hr',
-      'img',
-      'input',
-      'keygen',
-      'link',
-      'meta',
-      'param',
-      'source',
-      'track',
-      'wbr'
-    ]
-
-    # Lazy way of forwarding instance method calls used internally by Ragel to
-    # their corresponding class methods.
-    private_methods.grep(/^_lexer_/).each do |name|
-      define_method(name) do
-        return self.class.send(name)
-      end
-
-      private(name)
-    end
-
-    ##
-    # @param [Hash] options
-    #
-    # @option options [Symbol] :html When set to `true` the lexer will treat
-    #  the input as HTML instead of SGML/XML. This makes it possible to lex
-    #  HTML void elements such as `<link href="">`.
-    #
-    def initialize(options = {})
-      options.each do |key, value|
-        instance_variable_set("@#{key}", value) if respond_to?(key)
-      end
-
-      reset
-    end
-
-    ##
-    # Resets the internal state of the lexer. Typically you don't need to call
-    # this method yourself as its called by #lex after lexing a given String.
-    #
-    def reset
-      @line     = 1
-      @data     = nil
-      @ts       = nil
-      @te       = nil
-      @tokens   = []
-      @stack    = []
-      @top      = 0
-      @elements = []
-
-      @buffer_start_position = nil
-    end
-
-    ##
-    # Lexes the supplied String and returns an Array of tokens. Each token is
-    # an Array in the following format:
-    #
-    #     [TYPE, VALUE]
-    #
-    # The type is a symbol, the value is either nil or a String.
-    #
-    # @param [String] data The string to lex.
-    # @return [Array]
-    #
-    def lex(data)
-      @data       = data.unpack('U*')
-      lexer_start = self.class.lexer_start
-      eof         = data.length
-
-      %% write init;
-      %% write exec;
-
-      tokens = @tokens
-
-      reset
-
-      return tokens
-    end
-
-    ##
-    # @return [TrueClass|FalseClass]
-    #
-    def html?
-      return !!html
-    end
-
-    private
-
-    ##
-    # @param [Fixnum] amount The amount of lines to advance.
-    #
-    def advance_line(amount = 1)
-      @line += amount
-    end
-
-    ##
-    # Emits a token who's value is based on the supplied start/stop position.
-    #
-    # @param [Symbol] type The token type.
-    # @param [Fixnum] start
-    # @param [Fixnum] stop
-    #
-    # @see #text
-    # @see #add_token
-    #
-    def t(type, start = @ts, stop = @te)
-      value = text(start, stop)
-
-      add_token(type, value)
-    end
-
-    ##
-    # Returns the text of the current buffer based on the supplied start and
-    # stop position.
-    #
-    # By default `@ts` and `@te` are used as the start/stop position.
-    #
-    # @param [Fixnum] start
-    # @param [Fixnum] stop
-    # @return [String]
-    #
-    def text(start = @ts, stop = @te)
-      return @data[start...stop].pack('U*')
-    end
-
-    ##
-    # Adds a token with the given type and value to the list.
-    #
-    # @param [Symbol] type The token type.
-    # @param [String] value The token value.
-    #
-    def add_token(type, value = nil)
-      token = [type, value, @line]
-
-      @tokens << token
-    end
-
-    ##
-    # Enables buffering starting at the given position.
-    #
-    # @param [Fixnum] position The start position of the buffer, set to `@te`
-    #  by default.
-    #
-    def start_buffer(position = @te)
-      @buffer_start_position = position
-    end
-
-    ##
-    # Returns `true` if we're currently buffering.
-    #
-    # @return [TrueClass|FalseClass]
-    #
-    def buffering?
-      return !!@buffer_start_position
-    end
-
-    ##
-    # Emits the current buffer if we have any. The current line number is
-    # advanced based on the amount of newlines in the buffer.
-    #
-    # @param [Fixnum] position The end position of the buffer, set to `@ts` by
-    #  default.
-    #
-    # @param [Symbol] type The type of node to emit.
-    #
-    def emit_buffer(position = @ts, type = :T_TEXT)
-      return unless @buffer_start_position
-
-      content = text(@buffer_start_position, position)
-
-      unless content.empty?
-        add_token(type, content)
-
-        lines = content.count("\n")
-
-        advance_line(lines) if lines > 0
-      end
-
-      @buffer_start_position = nil
-    end
-
-    ##
-    # Returns the name of the element we're currently in.
-    #
-    # @return [String]
-    #
-    def current_element
-      return @elements.last
-    end
-
-    %%{
-      # Use instance variables for `ts` and friends.
-      access @;
-      getkey (@data[p] || 0);
-
-      newline    = '\n' | '\r\n';
-      whitespace = [ \t];
-
-      # Strings
-      #
-      # Strings in HTML can either be single or double quoted. If a string
-      # starts with one of these quotes it must be closed with the same type of
-      # quote.
-      dquote = '"';
-      squote = "'";
-
-      action start_string_dquote {
-        start_buffer
-
-        fcall string_dquote;
-      }
-
-      action start_string_squote {
-        start_buffer
-
-        fcall string_squote;
-      }
-
-      # Machine for processing double quoted strings.
-      string_dquote := |*
-        dquote => {
-          emit_buffer(@ts, :T_STRING)
-          fret;
-        };
-
-        any;
-      *|;
-
-      # Machine for processing single quoted strings.
-      string_squote := |*
-        squote => {
-          emit_buffer(@ts, :T_STRING)
-          fret;
-        };
-
-        any;
-      *|;
-
-      # DOCTYPES
-      #
-      # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
-      #
-      # These rules support the 3 flavours of doctypes:
-      #
-      # 1. Normal doctypes, as introduced in the HTML5 specification.
-      # 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
-      # 3. Legacy doctypes
-      #
-      doctype_start = '<!DOCTYPE'i whitespace+ 'HTML'i;
-
-      action start_doctype {
-        emit_buffer
-        add_token(:T_DOCTYPE_START)
-        fcall doctype;
-      }
-
-      # Machine for processing doctypes. Doctype values such as the public and
-      # system IDs are treated as T_STRING tokens.
-      doctype := |*
-        'PUBLIC' | 'SYSTEM' => { t(:T_DOCTYPE_TYPE) };
-
-        # Lex the public/system IDs as regular strings.
-        dquote => start_string_dquote;
-        squote => start_string_squote;
-
-        # Whitespace inside doctypes is ignored since there's no point in
-        # including it.
-        whitespace;
-
-        '>' => {
-          add_token(:T_DOCTYPE_END)
-          fret;
-        };
-      *|;
-
-      # CDATA
-      #
-      # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
-      #
-      # CDATA tags are broken up into 3 parts: the start, the content and the
-      # end tag.
-      #
-      # In HTML CDATA tags have no meaning/are not supported. Oga does
-      # support them but treats their contents as plain text.
-      #
-      cdata_start = '<![CDATA[';
-      cdata_end   = ']]>';
-
-      action start_cdata {
-        emit_buffer
-        add_token(:T_CDATA_START)
-
-        start_buffer
-
-        fcall cdata;
-      }
-
-      # Machine that for processing the contents of CDATA tags. Everything
-      # inside a CDATA tag is treated as plain text.
-      cdata := |*
-        cdata_end => {
-          emit_buffer
-          add_token(:T_CDATA_END)
-
-          fret;
-        };
-
-        any;
-      *|;
-
-      # Comments
-      #
-      # http://www.w3.org/TR/html-markup/syntax.html#comments
-      #
-      # Comments are lexed into 3 parts: the start tag, the content and the end
-      # tag.
-      #
-      # Unlike the W3 specification these rules *do* allow character sequences
-      # such as `--` and `->`. Putting extra checks in for these sequences
-      # would actually make the rules/actions more complex.
-      #
-      comment_start = '<!--';
-      comment_end   = '-->';
-
-      action start_comment {
-        emit_buffer
-        add_token(:T_COMMENT_START)
-
-        start_buffer
-
-        fcall comment;
-      }
-
-      # Machine used for processing the contents of a comment. Everything
-      # inside a comment is treated as plain text (similar to CDATA tags).
-      comment := |*
-        comment_end => {
-          emit_buffer
-          add_token(:T_COMMENT_END)
-
-          fret;
-        };
-
-        any;
-      *|;
-
-      # XML declaration tags
-      #
-      # http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
-      #
-      xml_decl_start = '<?xml';
-      xml_decl_end   = '?>';
-
-      action start_xml_decl {
-        emit_buffer
-        add_token(:T_XML_DECL_START)
-
-        start_buffer
-
-        fcall xml_decl;
-      }
-
-      # Machine that processes the contents of an XML declaration tag.
-      xml_decl := |*
-        xml_decl_end => {
-          emit_buffer
-          add_token(:T_XML_DECL_END)
-
-          fret;
-        };
-
-        any;
-      *|;
-
-      # Elements
-      #
-      # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
-      #
-
-      # Action that creates the tokens for the opening tag, name and namespace
-      # (if any). Remaining work is delegated to a dedicated machine.
-      action start_element {
-        emit_buffer
-        add_token(:T_ELEM_START)
-
-        # Add the element name. If the name includes a namespace we'll break
-        # the name up into two separate tokens.
-        name = text(@ts + 1)
-
-        if name.include?(':')
-          ns, name = name.split(':')
-
-          add_token(:T_ELEM_NS, ns)
-        end
-
-        @elements << name
-
-        add_token(:T_ELEM_NAME, name)
-
-        fcall element_head;
-      }
-
-      element_name  = [a-zA-Z0-9\-_:]+;
-      element_start = '<' element_name;
-
-      # Machine used for processing the characters inside a element head. An
-      # element head is everything between `<NAME` (where NAME is the element
-      # name) and `>`.
-      #
-      # For example, in `<p foo="bar">` the element head is ` foo="bar"`.
-      #
-      element_head := |*
-        whitespace | '=';
-
-        newline => { advance_line };
-
-        # Attribute names.
-        element_name => { t(:T_ATTR) };
-
-        # Attribute values.
-        dquote => start_string_dquote;
-        squote => start_string_squote;
-
-        # The closing character of the open tag.
-        ('>' | '/') => {
-          fhold;
-          fret;
-        };
-      *|;
-
-      main := |*
-        element_start  => start_element;
-        doctype_start  => start_doctype;
-        cdata_start    => start_cdata;
-        comment_start  => start_comment;
-        xml_decl_start => start_xml_decl;
-
-        # Enter the body of the tag. If HTML mode is enabled and the current
-        # element is a void element we'll close it and bail out.
-        '>' => {
-          if html? and HTML_VOID_ELEMENTS.include?(current_element)
-            add_token(:T_ELEM_END, nil)
-            @elements.pop
-          end
-        };
-
-        # Regular closing tags.
-        '</' element_name '>' => {
-          emit_buffer
-          add_token(:T_ELEM_END, nil)
-
-          @elements.pop
-        };
-
-        # Self closing elements that are not handled by the HTML mode.
-        '/>' => {
-          add_token(:T_ELEM_END, nil)
-
-          @elements.pop
-        };
-
-        # Note that this rule should be declared at the very bottom as it will
-        # otherwise take precedence over the other rules.
-        any => {
-          # First character, start buffering (unless we already are buffering).
-          start_buffer(@ts) unless buffering?
-
-          # EOF, emit the text buffer.
-          if @te == eof
-            emit_buffer(@te)
-          end
-        };
-      *|;
-    }%%
-  end # Lexer
-end # Oga
--- a/lib/oga/xml/lexer.rb
+++ b/lib/oga/xml/lexer.rb
--- a/lib/oga/xml/lexer.rl
+++ b/lib/oga/xml/lexer.rl
@ -0,0 +1,510 @@
+%%machine lexer; # %
+
+module Oga
+  module XML
+    ##
+    # Low level lexer that supports both XML and HTML (using an extra option). To
+    # lex HTML input set the `:html` option to `true` when creating an instance
+    # of the lexer:
+    #
+    #     lexer = Oga::XML::Lexer.new(:html => true)
+    #
+    # @!attribute [r] html
+    #  @return [TrueClass|FalseClass]
+    #
+    class Lexer
+      %% write data; # %
+
+      attr_reader :html
+
+      ##
+      # Names of the HTML void elements that should be handled when HTML lexing
+      # is enabled.
+      #
+      # @return [Array]
+      #
+      HTML_VOID_ELEMENTS = [
+        'area',
+        'base',
+        'br',
+        'col',
+        'command',
+        'embed',
+        'hr',
+        'img',
+        'input',
+        'keygen',
+        'link',
+        'meta',
+        'param',
+        'source',
+        'track',
+        'wbr'
+      ]
+
+      # Lazy way of forwarding instance method calls used internally by Ragel to
+      # their corresponding class methods.
+      private_methods.grep(/^_lexer_/).each do |name|
+        define_method(name) do
+          return self.class.send(name)
+        end
+
+        private(name)
+      end
+
+      ##
+      # @param [Hash] options
+      #
+      # @option options [Symbol] :html When set to `true` the lexer will treat
+      #  the input as HTML instead of SGML/XML. This makes it possible to lex
+      #  HTML void elements such as `<link href="">`.
+      #
+      def initialize(options = {})
+        options.each do |key, value|
+          instance_variable_set("@#{key}", value) if respond_to?(key)
+        end
+
+        reset
+      end
+
+      ##
+      # Resets the internal state of the lexer. Typically you don't need to call
+      # this method yourself as it's called by #lex after lexing a given String.
+      #
+      def reset
+        @line     = 1
+        @data     = nil
+        @ts       = nil
+        @te       = nil
+        @tokens   = []
+        @stack    = []
+        @top      = 0
+        @elements = []
+
+        @buffer_start_position = nil
+      end
+
+      ##
+      # Lexes the supplied String and returns an Array of tokens. Each token is
+      # an Array in the following format:
+      #
+      #     [TYPE, VALUE, LINE]
+      #
+      # The type is a Symbol, the value is nil or a String, the line a Fixnum.
+      #
+      # @param [String] data The string to lex.
+      # @return [Array]
+      #
+      def lex(data)
+        @data       = data.unpack('U*')
+        lexer_start = self.class.lexer_start
+        eof         = data.length
+
+        %% write init;
+        %% write exec;
+
+        tokens = @tokens
+
+        reset
+
+        return tokens
+      end
+
+      ##
+      # @return [TrueClass|FalseClass]
+      #
+      def html?
+        return !!html
+      end
+
+      private
+
+      ##
+      # @param [Fixnum] amount The amount of lines to advance.
+      #
+      def advance_line(amount = 1)
+        @line += amount
+      end
+
+      ##
+      # Emits a token whose value is based on the supplied start/stop position.
+      #
+      # @param [Symbol] type The token type.
+      # @param [Fixnum] start
+      # @param [Fixnum] stop
+      #
+      # @see #text
+      # @see #add_token
+      #
+      def t(type, start = @ts, stop = @te)
+        value = text(start, stop)
+
+        add_token(type, value)
+      end
+
+      ##
+      # Returns the text of the current buffer based on the supplied start and
+      # stop position.
+      #
+      # By default `@ts` and `@te` are used as the start/stop position.
+      #
+      # @param [Fixnum] start
+      # @param [Fixnum] stop
+      # @return [String]
+      #
+      def text(start = @ts, stop = @te)
+        return @data[start...stop].pack('U*')
+      end
+
+      ##
+      # Adds a token with the given type and value to the list.
+      #
+      # @param [Symbol] type The token type.
+      # @param [String] value The token value.
+      #
+      def add_token(type, value = nil)
+        token = [type, value, @line]
+
+        @tokens << token
+      end
+
+      ##
+      # Enables buffering starting at the given position.
+      #
+      # @param [Fixnum] position The start position of the buffer, set to `@te`
+      #  by default.
+      #
+      def start_buffer(position = @te)
+        @buffer_start_position = position
+      end
+
+      ##
+      # Returns `true` if we're currently buffering.
+      #
+      # @return [TrueClass|FalseClass]
+      #
+      def buffering?
+        return !!@buffer_start_position
+      end
+
+      ##
+      # Emits the current buffer if we have any. The current line number is
+      # advanced based on the amount of newlines in the buffer.
+      #
+      # @param [Fixnum] position The end position of the buffer, set to `@ts` by
+      #  default.
+      #
+      # @param [Symbol] type The type of node to emit.
+      #
+      def emit_buffer(position = @ts, type = :T_TEXT)
+        return unless @buffer_start_position
+
+        content = text(@buffer_start_position, position)
+
+        unless content.empty?
+          add_token(type, content)
+
+          lines = content.count("\n")
+
+          advance_line(lines) if lines > 0
+        end
+
+        @buffer_start_position = nil
+      end
+
+      ##
+      # Returns the name of the element we're currently in.
+      #
+      # @return [String]
+      #
+      def current_element
+        return @elements.last
+      end
+
+      %%{
+        # Use instance variables for `ts` and friends.
+        access @;
+        getkey (@data[p] || 0);
+
+        newline    = '\n' | '\r\n';
+        whitespace = [ \t];
+
+        # Strings
+        #
+        # Strings in HTML can either be single or double quoted. If a string
+        # starts with one of these quotes it must be closed with the same type of
+        # quote.
+        dquote = '"';
+        squote = "'";
+
+        action start_string_dquote {
+          start_buffer
+
+          fcall string_dquote;
+        }
+
+        action start_string_squote {
+          start_buffer
+
+          fcall string_squote;
+        }
+
+        # Machine for processing double quoted strings.
+        string_dquote := |*
+          dquote => {
+            emit_buffer(@ts, :T_STRING)
+            fret;
+          };
+
+          any;
+        *|;
+
+        # Machine for processing single quoted strings.
+        string_squote := |*
+          squote => {
+            emit_buffer(@ts, :T_STRING)
+            fret;
+          };
+
+          any;
+        *|;
+
+        # DOCTYPES
+        #
+        # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
+        #
+        # These rules support the 3 flavours of doctypes:
+        #
+        # 1. Normal doctypes, as introduced in the HTML5 specification.
+        # 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
+        # 3. Legacy doctypes
+        #
+        doctype_start = '<!DOCTYPE'i whitespace+ 'HTML'i;
+
+        action start_doctype {
+          emit_buffer
+          add_token(:T_DOCTYPE_START)
+          fcall doctype;
+        }
+
+        # Machine for processing doctypes. Doctype values such as the public and
+        # system IDs are treated as T_STRING tokens.
+        doctype := |*
+          'PUBLIC' | 'SYSTEM' => { t(:T_DOCTYPE_TYPE) };
+
+          # Lex the public/system IDs as regular strings.
+          dquote => start_string_dquote;
+          squote => start_string_squote;
+
+          # Whitespace inside doctypes is ignored since there's no point in
+          # including it.
+          whitespace;
+
+          '>' => {
+            add_token(:T_DOCTYPE_END)
+            fret;
+          };
+        *|;
+
+        # CDATA
+        #
+        # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
+        #
+        # CDATA tags are broken up into 3 parts: the start, the content and the
+        # end tag.
+        #
+        # In HTML CDATA tags have no meaning/are not supported. Oga does
+        # support them but treats their contents as plain text.
+        #
+        cdata_start = '<![CDATA[';
+        cdata_end   = ']]>';
+
+        action start_cdata {
+          emit_buffer
+          add_token(:T_CDATA_START)
+
+          start_buffer
+
+          fcall cdata;
+        }
+
+        # Machine for processing the contents of CDATA tags. Everything
+        # inside a CDATA tag is treated as plain text.
+        cdata := |*
+          cdata_end => {
+            emit_buffer
+            add_token(:T_CDATA_END)
+
+            fret;
+          };
+
+          any;
+        *|;
+
+        # Comments
+        #
+        # http://www.w3.org/TR/html-markup/syntax.html#comments
+        #
+        # Comments are lexed into 3 parts: the start tag, the content and the end
+        # tag.
+        #
+        # Unlike the W3 specification these rules *do* allow character sequences
+        # such as `--` and `->`. Putting extra checks in for these sequences
+        # would actually make the rules/actions more complex.
+        #
+        comment_start = '<!--';
+        comment_end   = '-->';
+
+        action start_comment {
+          emit_buffer
+          add_token(:T_COMMENT_START)
+
+          start_buffer
+
+          fcall comment;
+        }
+
+        # Machine used for processing the contents of a comment. Everything
+        # inside a comment is treated as plain text (similar to CDATA tags).
+        comment := |*
+          comment_end => {
+            emit_buffer
+            add_token(:T_COMMENT_END)
+
+            fret;
+          };
+
+          any;
+        *|;
+
+        # XML declaration tags
+        #
+        # http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
+        #
+        xml_decl_start = '<?xml';
+        xml_decl_end   = '?>';
+
+        action start_xml_decl {
+          emit_buffer
+          add_token(:T_XML_DECL_START)
+
+          start_buffer
+
+          fcall xml_decl;
+        }
+
+        # Machine that processes the contents of an XML declaration tag.
+        xml_decl := |*
+          xml_decl_end => {
+            emit_buffer
+            add_token(:T_XML_DECL_END)
+
+            fret;
+          };
+
+          any;
+        *|;
+
+        # Elements
+        #
+        # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
+        #
+
+        # Action that creates the tokens for the opening tag, name and namespace
+        # (if any). Remaining work is delegated to a dedicated machine.
+        action start_element {
+          emit_buffer
+          add_token(:T_ELEM_START)
+
+          # Add the element name. If the name includes a namespace we'll break
+          # the name up into two separate tokens.
+          name = text(@ts + 1)
+
+          if name.include?(':')
+            ns, name = name.split(':')
+
+            add_token(:T_ELEM_NS, ns)
+          end
+
+          @elements << name
+
+          add_token(:T_ELEM_NAME, name)
+
+          fcall element_head;
+        }
+
+        element_name  = [a-zA-Z0-9\-_:]+;
+        element_start = '<' element_name;
+
+        # Machine used for processing the characters inside an element head. An
+        # element head is everything between `<NAME` (where NAME is the element
+        # name) and `>`.
+        #
+        # For example, in `<p foo="bar">` the element head is ` foo="bar"`.
+        #
+        element_head := |*
+          whitespace | '=';
+
+          newline => { advance_line };
+
+          # Attribute names.
+          element_name => { t(:T_ATTR) };
+
+          # Attribute values.
+          dquote => start_string_dquote;
+          squote => start_string_squote;
+
+          # The closing character of the open tag.
+          ('>' | '/') => {
+            fhold;
+            fret;
+          };
+        *|;
+
+        main := |*
+          element_start  => start_element;
+          doctype_start  => start_doctype;
+          cdata_start    => start_cdata;
+          comment_start  => start_comment;
+          xml_decl_start => start_xml_decl;
+
+          # Enter the body of the tag. If HTML mode is enabled and the current
+          # element is a void element we'll close it and bail out.
+          '>' => {
+            if html? and HTML_VOID_ELEMENTS.include?(current_element)
+              add_token(:T_ELEM_END, nil)
+              @elements.pop
+            end
+          };
+
+          # Regular closing tags.
+          '</' element_name '>' => {
+            emit_buffer
+            add_token(:T_ELEM_END, nil)
+
+            @elements.pop
+          };
+
+          # Self closing elements that are not handled by the HTML mode.
+          '/>' => {
+            add_token(:T_ELEM_END, nil)
+
+            @elements.pop
+          };
+
+          # Note that this rule should be declared at the very bottom as it will
+          # otherwise take precedence over the other rules.
+          any => {
+            # First character, start buffering (unless we already are buffering).
+            start_buffer(@ts) unless buffering?
+
+            # EOF, emit the text buffer.
+            if @te == eof
+              emit_buffer(@te)
+            end
+          };
+        *|;
+      }%%
+    end # Lexer
+  end # XML
+end # Oga
--- a/lib/oga/xml/parser.rb
+++ b/lib/oga/xml/parser.rb
@ -0,0 +1,402 @@
+#
+# DO NOT MODIFY!!!!
+# This file is automatically generated by Racc 1.4.11
+# from Racc grammer file "".
+#
+
+require 'racc/parser.rb'
+module Oga
+  module XML
+    class Parser < Racc::Parser
+
+  ##
+  # @param [Hash] options
+  #
+  # @option options [TrueClass|FalseClass] :html Enables HTML parsing mode.
+  # @see Oga::XML::Lexer#initialize
+  #
+  def initialize(options = {}) # NOTE(review): #reset is not called here, so @line and @lines start out unset
+    @lexer = Lexer.new(options) # the options Hash is handed to the lexer as-is
+  end
+
+  ##
+  # Resets the internal state of the parser.
+  #
+  def reset
+    @lines = [] # source lines of the input; read by #on_error to print context
+    @line  = 1 # current line number; used by #s and #on_error, updated by #next_token
+  end
+
+  ##
+  # Creates a new AST node tagged with the current line number.
+  #
+  # @param [Symbol] type
+  # @param [Array] children
+  #
+  def s(type, *children)
+    return AST::Node.new(
+      type,
+      children.flatten, # nested arrays of children are flattened into a single list
+      :line => @line # whatever @line currently holds (set by #reset and #next_token)
+    )
+  end
+
+  ##
+  # Returns the next token from the lexer.
+  #
+  # @return [Array]
+  #
+  def next_token
+    type, value, line = @tokens.shift # tokens are [type, value, line] arrays; all three are nil once @tokens is empty
+
+    @line = line if line # keep the previous line number when the token carries none
+
+    return type ? [type, value] : [false, false] # false is the end-of-input token (see racc_token_table)
+  end
+
+  ##
+  # @param [Fixnum] type The type of token the error occurred on.
+  # @param [String] value The value of the token.
+  # @param [Array] stack The current stack of parsed nodes.
+  # @raise [Racc::ParseError]
+  #
+  def on_error(type, value, stack) # stack is accepted for Racc's callback signature but not used
+    name  = token_to_str(type) # Racc helper; presumably maps the token id to its name - confirm
+    index = @line - 1 # @line counts from 1, @lines is indexed from 0
+    lines = ''
+
+    # Show up to 5 lines before and after the offending line (if they exist).
+    (-5..5).each do |offset|
+      line   = @lines[index + offset] # a negative index wraps around to the end of @lines
+      number = @line + offset
+
+      if line and number > 0 # number > 0 discards entries read through a wrapped (negative) index
+        if offset == 0
+          prefix = '=> ' # marks the offending line
+        else
+          prefix = '   '
+        end
+
+        lines << "#{prefix}#{number}: #{line.strip}\n"
+      end
+    end
+
+    raise Racc::ParseError, <<-EOF
+Unexpected #{name} with value #{value.inspect} on line #{@line}:
+
+#{lines}
+    EOF
+  end
+
+  ##
+  # Parses the supplied string and returns the AST.
+  #
+  # @example
+  #  parser = Oga::XML::Parser.new
+  #  ast    = parser.parse('<foo>bar</foo>')
+  #
+  # @param [String] string
+  # @return [Oga::AST::Node]
+  #
+  def parse(string)
+    @lines  = string.lines # kept so #on_error can print the surrounding source lines
+    @tokens = @lexer.lex(string) # the whole input is lexed up front; #next_token shifts from this list
+    ast     = do_parse # Racc entry point; it pulls tokens via #next_token
+
+    reset # NOTE(review): not reached when do_parse raises, so @lines/@line keep their values after an error
+
+    return ast
+  end
+
+# vim: set ft=racc:
+##### State transition tables begin ###
+
+racc_action_table = [
+    16,    40,    16,    10,    24,    37,    11,    22,    12,    28,
+    14,    23,    21,    45,    31,    15,    16,    10,    44,    28,
+    11,    43,    12,    36,    14,    35,    16,    10,    34,    15,
+    11,    41,    12,    42,    14,    33,    16,    10,    17,    15,
+    11,    46,    12,   nil,    14,    29,    30,    19,    20,    15 ]
+
+racc_action_check = [
+    15,    28,    38,    38,    12,    24,    38,    11,    38,    13,
+    38,    12,    11,    38,    15,    38,     2,     2,    35,    26,
+     2,    35,     2,    22,     2,    20,    25,    25,    20,     2,
+    25,    30,    25,    32,    25,    17,     0,     0,     1,    25,
+     0,    44,     0,   nil,     0,    14,    14,    10,    10,     0 ]
+
+racc_action_pointer = [
+    33,    38,    13,   nil,   nil,   nil,   nil,   nil,   nil,   nil,
+    42,     4,     1,    -6,    33,    -3,   nil,    35,   nil,   nil,
+    23,   nil,    15,   nil,    -5,    23,     4,   nil,    -1,   nil,
+    19,   nil,    16,   nil,   nil,    16,   nil,   nil,    -1,   nil,
+   nil,   nil,   nil,   nil,    36,   nil,   nil ]
+
+racc_action_default = [
+    -2,   -32,    -1,    -4,    -6,    -7,    -8,    -9,   -10,   -11,
+   -32,   -32,   -32,   -24,   -32,   -32,   -31,   -32,    -3,   -12,
+   -32,   -16,   -32,   -18,   -32,    -5,   -23,   -26,   -27,   -21,
+   -32,   -29,   -32,    47,   -13,   -32,   -17,   -19,   -32,   -25,
+   -28,   -22,   -30,   -14,   -32,   -20,   -15 ]
+
+racc_goto_table = [
+    18,     2,    27,    32,    25,    26,     1,   nil,   nil,   nil,
+   nil,   nil,   nil,   nil,   nil,    39,   nil,   nil,   nil,   nil,
+   nil,   nil,   nil,   nil,   nil,   nil,    38,   nil,   nil,   nil,
+   nil,   nil,   nil,   nil,   nil,   nil,    18 ]
+
+racc_goto_check = [
+     3,     2,    13,     8,    11,    12,     1,   nil,   nil,   nil,
+   nil,   nil,   nil,   nil,   nil,    13,   nil,   nil,   nil,   nil,
+   nil,   nil,   nil,   nil,   nil,   nil,     2,   nil,   nil,   nil,
+   nil,   nil,   nil,   nil,   nil,   nil,     3 ]
+
+racc_goto_pointer = [
+   nil,     6,     1,    -2,   nil,   nil,   nil,   nil,   -12,   nil,
+   nil,    -9,    -8,   -11 ]
+
+racc_goto_default = [
+   nil,   nil,   nil,     3,     4,     5,     6,     7,     8,     9,
+    13,   nil,   nil,   nil ]
+
+racc_reduce_table = [
+  0, 0, :racc_error,
+  1, 19, :_reduce_1,
+  0, 19, :_reduce_2,
+  2, 20, :_reduce_3,
+  1, 20, :_reduce_4,
+  0, 20, :_reduce_5,
+  1, 21, :_reduce_none,
+  1, 21, :_reduce_none,
+  1, 21, :_reduce_none,
+  1, 21, :_reduce_none,
+  1, 21, :_reduce_none,
+  1, 21, :_reduce_none,
+  2, 22, :_reduce_12,
+  3, 22, :_reduce_13,
+  4, 22, :_reduce_14,
+  5, 22, :_reduce_15,
+  2, 23, :_reduce_16,
+  3, 23, :_reduce_17,
+  2, 24, :_reduce_18,
+  3, 24, :_reduce_19,
+  4, 25, :_reduce_20,
+  2, 28, :_reduce_21,
+  3, 28, :_reduce_22,
+  1, 29, :_reduce_23,
+  0, 29, :_reduce_24,
+  2, 30, :_reduce_25,
+  1, 30, :_reduce_26,
+  1, 31, :_reduce_27,
+  2, 31, :_reduce_28,
+  2, 27, :_reduce_29,
+  3, 27, :_reduce_30,
+  1, 26, :_reduce_31 ]
+
+racc_reduce_n = 32
+
+racc_shift_n = 47
+
+racc_token_table = {
+  false => 0,
+  :error => 1,
+  :T_STRING => 2,
+  :T_TEXT => 3,
+  :T_DOCTYPE_START => 4,
+  :T_DOCTYPE_END => 5,
+  :T_DOCTYPE_TYPE => 6,
+  :T_CDATA_START => 7,
+  :T_CDATA_END => 8,
+  :T_COMMENT_START => 9,
+  :T_COMMENT_END => 10,
+  :T_ELEM_START => 11,
+  :T_ELEM_NAME => 12,
+  :T_ELEM_NS => 13,
+  :T_ELEM_END => 14,
+  :T_ATTR => 15,
+  :T_XML_DECL_START => 16,
+  :T_XML_DECL_END => 17 }
+
+racc_nt_base = 18
+
+racc_use_result_var = false
+
+Racc_arg = [
+  racc_action_table,
+  racc_action_check,
+  racc_action_default,
+  racc_action_pointer,
+  racc_goto_table,
+  racc_goto_check,
+  racc_goto_default,
+  racc_goto_pointer,
+  racc_nt_base,
+  racc_reduce_table,
+  racc_token_table,
+  racc_shift_n,
+  racc_reduce_n,
+  racc_use_result_var ]
+
+Racc_token_to_s_table = [
+  "$end",
+  "error",
+  "T_STRING",
+  "T_TEXT",
+  "T_DOCTYPE_START",
+  "T_DOCTYPE_END",
+  "T_DOCTYPE_TYPE",
+  "T_CDATA_START",
+  "T_CDATA_END",
+  "T_COMMENT_START",
+  "T_COMMENT_END",
+  "T_ELEM_START",
+  "T_ELEM_NAME",
+  "T_ELEM_NS",
+  "T_ELEM_END",
+  "T_ATTR",
+  "T_XML_DECL_START",
+  "T_XML_DECL_END",
+  "$start",
+  "document",
+  "expressions",
+  "expression",
+  "doctype",
+  "cdata",
+  "comment",
+  "element",
+  "text",
+  "xmldecl",
+  "element_open",
+  "attributes",
+  "attributes_",
+  "attribute" ]
+
+Racc_debug_parser = false
+
+##### State transition tables end #####
+
+# reduce 0 omitted
+
+def _reduce_1(val, _values)
+ s(:document, val[0]) 
+end
+
+def _reduce_2(val, _values)
+ s(:document) 
+end
+
+def _reduce_3(val, _values)
+ val.compact 
+end
+
+def _reduce_4(val, _values)
+ val[0] 
+end
+
+def _reduce_5(val, _values)
+ nil 
+end
+
+# reduce 6 omitted
+
+# reduce 7 omitted
+
+# reduce 8 omitted
+
+# reduce 9 omitted
+
+# reduce 10 omitted
+
+# reduce 11 omitted
+
+def _reduce_12(val, _values)
+ s(:doctype) 
+end
+
+def _reduce_13(val, _values)
+        s(:doctype, val[1])
+      
+end
+
+def _reduce_14(val, _values)
+        s(:doctype, val[1], val[2])
+      
+end
+
+def _reduce_15(val, _values)
+        s(:doctype, val[1], val[2], val[3])
+      
+end
+
+def _reduce_16(val, _values)
+ s(:cdata) 
+end
+
+def _reduce_17(val, _values)
+ s(:cdata, val[1]) 
+end
+
+def _reduce_18(val, _values)
+ s(:comment) 
+end
+
+def _reduce_19(val, _values)
+ s(:comment, val[1]) 
+end
+
+def _reduce_20(val, _values)
+        s(:element, val[0], val[1], val[2])
+      
+end
+
+def _reduce_21(val, _values)
+ [nil, val[1]] 
+end
+
+def _reduce_22(val, _values)
+ [val[1], val[2]] 
+end
+
+def _reduce_23(val, _values)
+ s(:attributes, val[0]) 
+end
+
+def _reduce_24(val, _values)
+ nil 
+end
+
+def _reduce_25(val, _values)
+ val 
+end
+
+def _reduce_26(val, _values)
+ val 
+end
+
+def _reduce_27(val, _values)
+ s(:attribute, val[0]) 
+end
+
+def _reduce_28(val, _values)
+ s(:attribute, val[0], val[1]) 
+end
+
+def _reduce_29(val, _values)
+ s(:xml_decl) 
+end
+
+def _reduce_30(val, _values)
+ s(:xml_decl, val[1]) 
+end
+
+def _reduce_31(val, _values)
+ s(:text, val[0]) 
+end
+
+def _reduce_none(val, _values)
+  val[0]
+end
+
+    end   # class Parser
+    end   # module XML
+  end   # module Oga
--- a/lib/oga/xml/parser.y
+++ b/lib/oga/xml/parser.y
@ -5,9 +5,9 @@
 # It requires every tag to have a closing tag. As such you'll need to enable
 # HTML parsing mode when parsing HTML. This can be done as following:
 #
-#     parser = Oga::Parser.new(:html => true)
+#     parser = Oga::XML::Parser.new(:html => true)
 #
-class Oga::Parser
+class Oga::XML::Parser

 token T_STRING T_TEXT
 token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE
--- a/spec/oga/lexer/cdata_spec.rb
+++ b/spec/oga/lexer/cdata_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-describe Oga::Lexer do
+describe Oga::XML::Lexer do
  context 'cdata tags' do
    example 'lex a cdata tag' do
      lex('<![CDATA[foo]]>').should == [
--- a/spec/oga/lexer/comments_spec.rb
+++ b/spec/oga/lexer/comments_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-describe Oga::Lexer do
+describe Oga::XML::Lexer do
  context 'comments' do
    example 'lex a comment' do
      lex('<!-- foo -->').should == [
--- a/spec/oga/lexer/doctype_spec.rb
+++ b/spec/oga/lexer/doctype_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-describe Oga::Lexer do
+describe Oga::XML::Lexer do
  context 'doctypes' do
    example 'lex the HTML5 doctype' do
      lex('<!DOCTYPE html>').should == [
--- a/spec/oga/lexer/documents_spec.rb
+++ b/spec/oga/lexer/documents_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-describe Oga::Lexer do
+describe Oga::XML::Lexer do
  context 'HTML documents' do
    example 'lex a basic HTML document' do
      html = <<-EOF
--- a/spec/oga/lexer/elements_spec.rb
+++ b/spec/oga/lexer/elements_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-describe Oga::Lexer do
+describe Oga::XML::Lexer do
  context 'elements' do
    example 'lex an opening element' do
      lex('<p>').should == [
--- a/spec/oga/lexer/general_spec.rb
+++ b/spec/oga/lexer/general_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-describe Oga::Lexer do
+describe Oga::XML::Lexer do
  context 'regular text' do
    example 'lex regular text' do
      lex('hello').should == [[:T_TEXT, 'hello', 1]]
--- a/spec/oga/lexer/html_void_elements_spec.rb
+++ b/spec/oga/lexer/html_void_elements_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-describe Oga::Lexer do
+describe Oga::XML::Lexer do
  context 'HTML void elements' do
    example 'lex a void element that omits the closing /' do
      lex('<link>', :html => true).should == [
--- a/spec/oga/lexer/xml_declaration_spec.rb
+++ b/spec/oga/lexer/xml_declaration_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-describe Oga::Lexer do
+describe Oga::XML::Lexer do
  context 'XML declaration tags' do
    example 'lex a start tag' do
      lex('<?xml').should == [[:T_XML_DECL_START, nil, 1]]
--- a/spec/oga/parser/cdata_spec.rb
+++ b/spec/oga/parser/cdata_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-describe Oga::Parser do
+describe Oga::XML::Parser do
  context 'cdata tags' do
    example 'parse a cdata tag' do
      parse('<![CDATA[foo]]>').should == s(:document, s(:cdata, 'foo'))
--- a/spec/oga/parser/comments_spec.rb
+++ b/spec/oga/parser/comments_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-describe Oga::Parser do
+describe Oga::XML::Parser do
  context 'comments' do
    example 'parse an empty comment' do
      parse('<!---->').should == s(:document, s(:comment))
--- a/spec/oga/parser/doctype_spec.rb
+++ b/spec/oga/parser/doctype_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-describe Oga::Parser do
+describe Oga::XML::Parser do
  context 'doctypes' do
    example 'parse a doctype' do
      parse('<!DOCTYPE html>').should == s(:document, s(:doctype))
--- a/spec/oga/parser/documents_spec.rb
+++ b/spec/oga/parser/documents_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-describe Oga::Parser do
+describe Oga::XML::Parser do
  context 'HTML documents' do
    example 'parse a basic HTML document' do
      html = <<-EOF
--- a/spec/oga/parser/elements_spec.rb
+++ b/spec/oga/parser/elements_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-describe Oga::Parser do
+describe Oga::XML::Parser do
  context 'elements' do
    example 'parse an empty element' do
      parse('<p></p>').should == s(
--- a/spec/oga/parser/general_spec.rb
+++ b/spec/oga/parser/general_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-describe Oga::Parser do
+describe Oga::XML::Parser do
  example 'parse regular text' do
    parse('foo').should == s(:document, s(:text, 'foo'))
  end
--- a/spec/oga/parser/html_void_elements_spec.rb
+++ b/spec/oga/parser/html_void_elements_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-describe Oga::Parser do
+describe Oga::XML::Parser do
  context 'HTML void elements' do
    example 'parse a void element that omits the closing /' do
      parse('<link>', :html => true).should == s(
--- a/spec/oga/parser/xml_declaration_spec.rb
+++ b/spec/oga/parser/xml_declaration_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-describe Oga::Parser do
+describe Oga::XML::Parser do
  context 'XML declaration tags' do
    example 'lex an XML declaration tag' do
      parse('<?xml hello ?>').should == s(
--- a/spec/support/parsing.rb
+++ b/spec/support/parsing.rb
@ -19,7 +19,7 @@ module Oga
    # @return [Array]
    #
    def lex(input, options = {})
-      return Oga::Lexer.new(options).lex(input)
+      return Oga::XML::Lexer.new(options).lex(input)
    end

    ##
@ -30,7 +30,7 @@ module Oga
    # @return [Oga::AST::Node]
    #
    def parse(input, options = {})
-      return Oga::Parser.new(options).parse(input)
+      return Oga::XML::Parser.new(options).parse(input)
    end
  end # ParsingHelpers
 end # Oga