Namespaced the lexer/parser under Oga::XML.

With the upcoming XPath and CSS selector lexers/parsers it will be confusing to keep these in the root namespace.
2014-03-25 09:34:38 +01:00 · 2014-03-25 09:34:38 +01:00 · eae13d21ed
parent 2259061c89
commit eae13d21ed
28 changed files with 2049 additions and 537 deletions
--- a/6
+++ b/6
@ -5,10 +5,10 @@ require 'cliver'
 GEMSPEC = Gem::Specification.load('oga.gemspec')
-LEXER_INPUT  = 'lib/oga/lexer.rl'
+LEXER_INPUT  = 'lib/oga/xml/lexer.rl'
-LEXER_OUTPUT = 'lib/oga/lexer.rb'
+LEXER_OUTPUT = 'lib/oga/xml/lexer.rb'
-HTML_PARSER = 'lib/oga/parser.rb'
+HTML_PARSER = 'lib/oga/xml/parser.rb'
 GENERATED_FILES = ['coverage', 'yardoc', LEXER_OUTPUT, HTML_PARSER]
--- a/benchmark/lexer/bench_cdata.rb
+++ b/benchmark/lexer/bench_cdata.rb
@ -5,7 +5,7 @@ string = 'Hello, how are you doing today?'
 small  = "<![CDATA[#{string}]]>"
 medium = "<![CDATA[#{string * 1_000}]]>"
 large  = "<![CDATA[#{string * 10_000}]]>"
-lexer  = Oga::Lexer.new
+lexer  = Oga::XML::Lexer.new
 Benchmark.ips do |bench|
  bench.report 'CDATA with a small body' do
--- a/benchmark/lexer/bench_element.rb
+++ b/benchmark/lexer/bench_element.rb
@ -4,7 +4,7 @@ require 'benchmark/ips'
 simple     = '<p>Hello world</p>'
 attributes = '<p class="foo">Hello world</p>'
 nested     = '<p>Hello<strong>world</strong></p>'
-lexer      = Oga::Lexer.new
+lexer      = Oga::XML::Lexer.new
 Benchmark.ips do |bench|
  bench.report 'text only' do
--- a/benchmark/lexer/bench_html.rb
+++ b/benchmark/lexer/bench_html.rb
@ -2,7 +2,7 @@ require_relative '../../lib/oga'
 require 'benchmark/ips'
 html  = File.read(File.expand_path('../../fixtures/hrs.html', __FILE__))
-lexer = Oga::Lexer.new(:html => true)
+lexer = Oga::XML::Lexer.new(:html => true)
 Benchmark.ips do |bench|
  bench.report 'lex HTML' do
--- a/benchmark/lexer/bench_html_time.rb
+++ b/benchmark/lexer/bench_html_time.rb
@ -2,7 +2,7 @@ require_relative '../../lib/oga'
 require 'benchmark'
 html  = File.read(File.expand_path('../../fixtures/hrs.html', __FILE__))
-lexer = Oga::Lexer.new(:html => true)
+lexer = Oga::XML::Lexer.new(:html => true)
 Benchmark.bmbm(20) do |bench|
  bench.report 'lex HTML' do
--- a/lib/oga.rb
+++ b/lib/oga.rb
@ -1,5 +1,5 @@
 require 'ast'
 require_relative 'oga/ast/node'
-require_relative 'oga/lexer'
+require_relative 'oga/xml/lexer'
-require_relative 'oga/parser'
+require_relative 'oga/xml/parser'
--- a/lib/oga/lexer.rl
+++ b/lib/oga/lexer.rl
@ -1,508 +0,0 @@
 %%machine lexer; # %
 module Oga
  ##
  # Low level lexer that supports both XML and HTML (using an extra option). To
  # lex HTML input set the `:html` option to `true` when creating an instance
  # of the lexer:
  #
  #     lexer = Oga::Lexer.new(:html => true)
  #
  # @!attribute [r] html
  #  @return [TrueClass|FalseClass]
  #
  class Lexer
    %% write data; # %
    attr_reader :html
    ##
    # Names of the HTML void elements that should be handled when HTML lexing
    # is enabled.
    #
    # @return [Array]
    #
    HTML_VOID_ELEMENTS = [
      'area',
      'base',
      'br',
      'col',
      'command',
      'embed',
      'hr',
      'img',
      'input',
      'keygen',
      'link',
      'meta',
      'param',
      'source',
      'track',
      'wbr'
    ]
    # Lazy way of forwarding instance method calls used internally by Ragel to
    # their corresponding class methods.
    private_methods.grep(/^_lexer_/).each do |name|
      define_method(name) do
        return self.class.send(name)
      end
      private(name)
    end
    ##
    # @param [Hash] options
    #
    # @option options [Symbol] :html When set to `true` the lexer will treat
    #  the input as HTML instead of SGML/XML. This makes it possible to lex
    #  HTML void elements such as `<link href="">`.
    #
    def initialize(options = {})
      options.each do |key, value|
        instance_variable_set("@#{key}", value) if respond_to?(key)
      end
      reset
    end
    ##
    # Resets the internal state of the lexer. Typically you don't need to call
    # this method yourself as its called by #lex after lexing a given String.
    #
    def reset
      @line     = 1
      @data     = nil
      @ts       = nil
      @te       = nil
      @tokens   = []
      @stack    = []
      @top      = 0
      @elements = []
      @buffer_start_position = nil
    end
    ##
    # Lexes the supplied String and returns an Array of tokens. Each token is
    # an Array in the following format:
    #
    #     [TYPE, VALUE]
    #
    # The type is a symbol, the value is either nil or a String.
    #
    # @param [String] data The string to lex.
    # @return [Array]
    #
    def lex(data)
      @data       = data.unpack('U*')
      lexer_start = self.class.lexer_start
      eof         = data.length
      %% write init;
      %% write exec;
      tokens = @tokens
      reset
      return tokens
    end
    ##
    # @return [TrueClass|FalseClass]
    #
    def html?
      return !!html
    end
    private
    ##
    # @param [Fixnum] amount The amount of lines to advance.
    #
    def advance_line(amount = 1)
      @line += amount
    end
    ##
    # Emits a token who's value is based on the supplied start/stop position.
    #
    # @param [Symbol] type The token type.
    # @param [Fixnum] start
    # @param [Fixnum] stop
    #
    # @see #text
    # @see #add_token
    #
    def t(type, start = @ts, stop = @te)
      value = text(start, stop)
      add_token(type, value)
    end
    ##
    # Returns the text of the current buffer based on the supplied start and
    # stop position.
    #
    # By default `@ts` and `@te` are used as the start/stop position.
    #
    # @param [Fixnum] start
    # @param [Fixnum] stop
    # @return [String]
    #
    def text(start = @ts, stop = @te)
      return @data[start...stop].pack('U*')
    end
    ##
    # Adds a token with the given type and value to the list.
    #
    # @param [Symbol] type The token type.
    # @param [String] value The token value.
    #
    def add_token(type, value = nil)
      token = [type, value, @line]
      @tokens << token
    end
    ##
    # Enables buffering starting at the given position.
    #
    # @param [Fixnum] position The start position of the buffer, set to `@te`
    #  by default.
    #
    def start_buffer(position = @te)
      @buffer_start_position = position
    end
    ##
    # Returns `true` if we're currently buffering.
    #
    # @return [TrueClass|FalseClass]
    #
    def buffering?
      return !!@buffer_start_position
    end
    ##
    # Emits the current buffer if we have any. The current line number is
    # advanced based on the amount of newlines in the buffer.
    #
    # @param [Fixnum] position The end position of the buffer, set to `@ts` by
    #  default.
    #
    # @param [Symbol] type The type of node to emit.
    #
    def emit_buffer(position = @ts, type = :T_TEXT)
      return unless @buffer_start_position
      content = text(@buffer_start_position, position)
      unless content.empty?
        add_token(type, content)
        lines = content.count("\n")
        advance_line(lines) if lines > 0
      end
      @buffer_start_position = nil
    end
    ##
    # Returns the name of the element we're currently in.
    #
    # @return [String]
    #
    def current_element
      return @elements.last
    end
    %%{
      # Use instance variables for `ts` and friends.
      access @;
      getkey (@data[p] || 0);
      newline    = '\n' | '\r\n';
      whitespace = [ \t];
      # Strings
      #
      # Strings in HTML can either be single or double quoted. If a string
      # starts with one of these quotes it must be closed with the same type of
      # quote.
      dquote = '"';
      squote = "'";
      action start_string_dquote {
        start_buffer
        fcall string_dquote;
      }
      action start_string_squote {
        start_buffer
        fcall string_squote;
      }
      # Machine for processing double quoted strings.
      string_dquote := |*
        dquote => {
          emit_buffer(@ts, :T_STRING)
          fret;
        };
        any;
      *|;
      # Machine for processing single quoted strings.
      string_squote := |*
        squote => {
          emit_buffer(@ts, :T_STRING)
          fret;
        };
        any;
      *|;
      # DOCTYPES
      #
      # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
      #
      # These rules support the 3 flavours of doctypes:
      #
      # 1. Normal doctypes, as introduced in the HTML5 specification.
      # 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
      # 3. Legacy doctypes
      #
      doctype_start = '<!DOCTYPE'i whitespace+ 'HTML'i;
      action start_doctype {
        emit_buffer
        add_token(:T_DOCTYPE_START)
        fcall doctype;
      }
      # Machine for processing doctypes. Doctype values such as the public and
      # system IDs are treated as T_STRING tokens.
      doctype := |*
        'PUBLIC' | 'SYSTEM' => { t(:T_DOCTYPE_TYPE) };
        # Lex the public/system IDs as regular strings.
        dquote => start_string_dquote;
        squote => start_string_squote;
        # Whitespace inside doctypes is ignored since there's no point in
        # including it.
        whitespace;
        '>' => {
          add_token(:T_DOCTYPE_END)
          fret;
        };
      *|;
      # CDATA
      #
      # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
      #
      # CDATA tags are broken up into 3 parts: the start, the content and the
      # end tag.
      #
      # In HTML CDATA tags have no meaning/are not supported. Oga does
      # support them but treats their contents as plain text.
      #
      cdata_start = '<![CDATA[';
      cdata_end   = ']]>';
      action start_cdata {
        emit_buffer
        add_token(:T_CDATA_START)
        start_buffer
        fcall cdata;
      }
      # Machine that for processing the contents of CDATA tags. Everything
      # inside a CDATA tag is treated as plain text.
      cdata := |*
        cdata_end => {
          emit_buffer
          add_token(:T_CDATA_END)
          fret;
        };
        any;
      *|;
      # Comments
      #
      # http://www.w3.org/TR/html-markup/syntax.html#comments
      #
      # Comments are lexed into 3 parts: the start tag, the content and the end
      # tag.
      #
      # Unlike the W3 specification these rules *do* allow character sequences
      # such as `--` and `->`. Putting extra checks in for these sequences
      # would actually make the rules/actions more complex.
      #
      comment_start = '<!--';
      comment_end   = '-->';
      action start_comment {
        emit_buffer
        add_token(:T_COMMENT_START)
        start_buffer
        fcall comment;
      }
      # Machine used for processing the contents of a comment. Everything
      # inside a comment is treated as plain text (similar to CDATA tags).
      comment := |*
        comment_end => {
          emit_buffer
          add_token(:T_COMMENT_END)
          fret;
        };
        any;
      *|;
      # XML declaration tags
      #
      # http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
      #
      xml_decl_start = '<?xml';
      xml_decl_end   = '?>';
      action start_xml_decl {
        emit_buffer
        add_token(:T_XML_DECL_START)
        start_buffer
        fcall xml_decl;
      }
      # Machine that processes the contents of an XML declaration tag.
      xml_decl := |*
        xml_decl_end => {
          emit_buffer
          add_token(:T_XML_DECL_END)
          fret;
        };
        any;
      *|;
      # Elements
      #
      # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
      #
      # Action that creates the tokens for the opening tag, name and namespace
      # (if any). Remaining work is delegated to a dedicated machine.
      action start_element {
        emit_buffer
        add_token(:T_ELEM_START)
        # Add the element name. If the name includes a namespace we'll break
        # the name up into two separate tokens.
        name = text(@ts + 1)
        if name.include?(':')
          ns, name = name.split(':')
          add_token(:T_ELEM_NS, ns)
        end
        @elements << name
        add_token(:T_ELEM_NAME, name)
        fcall element_head;
      }
      element_name  = [a-zA-Z0-9\-_:]+;
      element_start = '<' element_name;
      # Machine used for processing the characters inside a element head. An
      # element head is everything between `<NAME` (where NAME is the element
      # name) and `>`.
      #
      # For example, in `<p foo="bar">` the element head is ` foo="bar"`.
      #
      element_head := |*
        whitespace | '=';
        newline => { advance_line };
        # Attribute names.
        element_name => { t(:T_ATTR) };
        # Attribute values.
        dquote => start_string_dquote;
        squote => start_string_squote;
        # The closing character of the open tag.
        ('>' | '/') => {
          fhold;
          fret;
        };
      *|;
      main := |*
        element_start  => start_element;
        doctype_start  => start_doctype;
        cdata_start    => start_cdata;
        comment_start  => start_comment;
        xml_decl_start => start_xml_decl;
        # Enter the body of the tag. If HTML mode is enabled and the current
        # element is a void element we'll close it and bail out.
        '>' => {
          if html? and HTML_VOID_ELEMENTS.include?(current_element)
            add_token(:T_ELEM_END, nil)
            @elements.pop
          end
        };
        # Regular closing tags.
        '</' element_name '>' => {
          emit_buffer
          add_token(:T_ELEM_END, nil)
          @elements.pop
        };
        # Self closing elements that are not handled by the HTML mode.
        '/>' => {
          add_token(:T_ELEM_END, nil)
          @elements.pop
        };
        # Note that this rule should be declared at the very bottom as it will
        # otherwise take precedence over the other rules.
        any => {
          # First character, start buffering (unless we already are buffering).
          start_buffer(@ts) unless buffering?
          # EOF, emit the text buffer.
          if @te == eof
            emit_buffer(@te)
          end
        };
      *|;
    }%%
  end # Lexer
 end # Oga
--- a/lib/oga/xml/lexer.rb
+++ b/lib/oga/xml/lexer.rb
--- a/lib/oga/xml/lexer.rl
+++ b/lib/oga/xml/lexer.rl
@ -0,0 +1,510 @@
 %%machine lexer; # %
 module Oga
  module XML
    ##
    # Low level lexer that supports both XML and HTML (using an extra option). To
    # lex HTML input set the `:html` option to `true` when creating an instance
    # of the lexer:
    #
    #     lexer = Oga::XML::Lexer.new(:html => true)
    #
    # @!attribute [r] html
    #  @return [TrueClass|FalseClass]
    #
    class Lexer
      %% write data; # %
      attr_reader :html
      ##
      # Names of the HTML void elements that should be handled when HTML lexing
      # is enabled. The Array is frozen so that the shared list can't be
      # modified by accident at runtime.
      #
      # @return [Array]
      #
      HTML_VOID_ELEMENTS = [
        'area',
        'base',
        'br',
        'col',
        'command',
        'embed',
        'hr',
        'img',
        'input',
        'keygen',
        'link',
        'meta',
        'param',
        'source',
        'track',
        'wbr'
      ].freeze
      # Ragel's generated code calls the `_lexer_*` methods on the instance.
      # For every private class level method with such a name a private
      # instance method is defined that simply forwards to the class method.
      private_methods.grep(/^_lexer_/).each do |ragel_method|
        define_method(ragel_method) { self.class.send(ragel_method) }
        private(ragel_method)
      end
      ##
      # Creates a new lexer and applies the given options.
      #
      # An option is only stored (in an instance variable of the same name) if
      # the lexer responds to a public method with that name, all other keys
      # are ignored. The internal state is reset afterwards.
      #
      # @param [Hash] options
      #
      # @option options [TrueClass|FalseClass] :html When set to `true` the
      #  lexer will treat the input as HTML instead of SGML/XML. This makes it
      #  possible to lex HTML void elements such as `<link href="">`.
      #
      def initialize(options = {})
        options.each do |key, value|
          next unless respond_to?(key)
          instance_variable_set("@#{key}", value)
        end
        reset
      end
      ##
      # Puts the lexer back into its initial state: line 1, no input data, no
      # token start/end positions, no active text buffer and empty token,
      # stack and element lists.
      #
      # Typically you don't need to call this method yourself as it's called
      # by #lex after lexing a given String.
      #
      def reset
        @line     = 1
        @top      = 0
        @tokens   = []
        @stack    = []
        @elements = []
        @data = @ts = @te = @buffer_start_position = nil
      end
      ##
      # Lexes the supplied String and returns an Array of tokens. Each token is
      # an Array in the following format:
      #
      #     [TYPE, VALUE, LINE]
      #
      # The type is a symbol, the value is either nil or a String and the line
      # is the line number (starting at 1) the lexer was on when the token was
      # added.
      #
      # The internal state is reset before returning.
      #
      # @param [String] data The string to lex.
      # @return [Array]
      #
      def lex(data)
        # The state machine reads its input from @data, an Array of Unicode
        # codepoints, instead of from the String itself.
        @data       = data.unpack('U*')
        lexer_start = self.class.lexer_start
        eof         = data.length
        %% write init;
        %% write exec;
        # Keep a reference to the tokens since #reset assigns a new Array to
        # @tokens.
        tokens = @tokens
        reset
        return tokens
      end
      ##
      # Returns `true` when the `html` option is set to a truthy value,
      # `false` otherwise.
      #
      # @return [TrueClass|FalseClass]
      #
      def html?
        html ? true : false
      end
      private
      ##
      # Moves the current line number forward.
      #
      # @param [Fixnum] amount The amount of lines to advance, 1 by default.
      #
      def advance_line(amount = 1)
        @line = @line + amount
      end
      ##
      # Emits a token whose value is the text found between the supplied
      # start/stop position. By default `@ts` and `@te` are used.
      #
      # @param [Symbol] type The token type.
      # @param [Fixnum] start
      # @param [Fixnum] stop
      #
      # @see #text
      # @see #add_token
      #
      def t(type, start = @ts, stop = @te)
        add_token(type, text(start, stop))
      end
      ##
      # Returns the portion of the input between the supplied start position
      # (inclusive) and stop position (exclusive) as a String. The codepoints
      # stored in `@data` are packed back into a String.
      #
      # By default `@ts` and `@te` are used as the start/stop position.
      #
      # @param [Fixnum] start
      # @param [Fixnum] stop
      # @return [String]
      #
      def text(start = @ts, stop = @te)
        codepoints = @data[start...stop]
        codepoints.pack('U*')
      end
      ##
      # Appends a token to the list of tokens. The token is stored as an Array
      # containing the type, the value and the current line number.
      #
      # @param [Symbol] type The token type.
      # @param [String] value The token value.
      #
      def add_token(type, value = nil)
        @tokens << [type, value, @line]
      end
      ##
      # Enables buffering starting at the given position. The position is
      # remembered until the buffer is emitted using #emit_buffer.
      #
      # @param [Fixnum] position The start position of the buffer, set to `@te`
      #  by default.
      #
      def start_buffer(position = @te)
        @buffer_start_position = position
      end
      ##
      # Returns `true` if a buffer start position has been set, meaning we're
      # currently buffering.
      #
      # @return [TrueClass|FalseClass]
      #
      def buffering?
        @buffer_start_position ? true : false
      end
      ##
      # Emits the current buffer if we have any and stops buffering. Nothing
      # is emitted for an empty buffer.
      #
      # The token is added before the line number is advanced by the amount of
      # newlines in the buffer, so the token carries the line the buffer
      # started on.
      #
      # @param [Fixnum] position The end position of the buffer, set to `@ts` by
      #  default.
      #
      # @param [Symbol] type The type of node to emit.
      #
      def emit_buffer(position = @ts, type = :T_TEXT)
        if @buffer_start_position
          buffered = text(@buffer_start_position, position)
          if buffered.length > 0
            add_token(type, buffered)
            newlines = buffered.count("\n")
            advance_line(newlines) unless newlines.zero?
          end
          @buffer_start_position = nil
        end
      end
      ##
      # Returns the name of the element we're currently in, which is the name
      # most recently added to the list of open elements (`nil` if there is
      # none).
      #
      # @return [String]
      #
      def current_element
        @elements[-1]
      end
      %%{
        # Use instance variables for `ts` and friends.
        access @;
        getkey (@data[p] || 0);
        newline    = '\n' | '\r\n';
        whitespace = [ \t];
        # Strings
        #
        # Strings in HTML can either be single or double quoted. If a string
        # starts with one of these quotes it must be closed with the same type of
        # quote.
        dquote = '"';
        squote = "'";
        action start_string_dquote {
          start_buffer
          fcall string_dquote;
        }
        action start_string_squote {
          start_buffer
          fcall string_squote;
        }
        # Machine for processing double quoted strings.
        string_dquote := |*
          dquote => {
            emit_buffer(@ts, :T_STRING)
            fret;
          };
          any;
        *|;
        # Machine for processing single quoted strings.
        string_squote := |*
          squote => {
            emit_buffer(@ts, :T_STRING)
            fret;
          };
          any;
        *|;
        # DOCTYPES
        #
        # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
        #
        # These rules support the 3 flavours of doctypes:
        #
        # 1. Normal doctypes, as introduced in the HTML5 specification.
        # 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
        # 3. Legacy doctypes
        #
        doctype_start = '<!DOCTYPE'i whitespace+ 'HTML'i;
        action start_doctype {
          emit_buffer
          add_token(:T_DOCTYPE_START)
          fcall doctype;
        }
        # Machine for processing doctypes. Doctype values such as the public and
        # system IDs are treated as T_STRING tokens.
        doctype := |*
          'PUBLIC' | 'SYSTEM' => { t(:T_DOCTYPE_TYPE) };
          # Lex the public/system IDs as regular strings.
          dquote => start_string_dquote;
          squote => start_string_squote;
          # Whitespace inside doctypes is ignored since there's no point in
          # including it.
          whitespace;
          '>' => {
            add_token(:T_DOCTYPE_END)
            fret;
          };
        *|;
        # CDATA
        #
        # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
        #
        # CDATA tags are broken up into 3 parts: the start, the content and the
        # end tag.
        #
        # In HTML CDATA tags have no meaning/are not supported. Oga does
        # support them but treats their contents as plain text.
        #
        cdata_start = '<![CDATA[';
        cdata_end   = ']]>';
        action start_cdata {
          emit_buffer
          add_token(:T_CDATA_START)
          start_buffer
          fcall cdata;
        }
        # Machine for processing the contents of CDATA tags. Everything
        # inside a CDATA tag is treated as plain text.
        cdata := |*
          cdata_end => {
            emit_buffer
            add_token(:T_CDATA_END)
            fret;
          };
          any;
        *|;
        # Comments
        #
        # http://www.w3.org/TR/html-markup/syntax.html#comments
        #
        # Comments are lexed into 3 parts: the start tag, the content and the end
        # tag.
        #
        # Unlike the W3 specification these rules *do* allow character sequences
        # such as `--` and `->`. Putting extra checks in for these sequences
        # would actually make the rules/actions more complex.
        #
        comment_start = '<!--';
        comment_end   = '-->';
        action start_comment {
          emit_buffer
          add_token(:T_COMMENT_START)
          start_buffer
          fcall comment;
        }
        # Machine used for processing the contents of a comment. Everything
        # inside a comment is treated as plain text (similar to CDATA tags).
        comment := |*
          comment_end => {
            emit_buffer
            add_token(:T_COMMENT_END)
            fret;
          };
          any;
        *|;
        # XML declaration tags
        #
        # http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
        #
        xml_decl_start = '<?xml';
        xml_decl_end   = '?>';
        action start_xml_decl {
          emit_buffer
          add_token(:T_XML_DECL_START)
          start_buffer
          fcall xml_decl;
        }
        # Machine that processes the contents of an XML declaration tag.
        xml_decl := |*
          xml_decl_end => {
            emit_buffer
            add_token(:T_XML_DECL_END)
            fret;
          };
          any;
        *|;
        # Elements
        #
        # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
        #
        # Action that creates the tokens for the opening tag, name and namespace
        # (if any). Remaining work is delegated to a dedicated machine.
        action start_element {
          emit_buffer
          add_token(:T_ELEM_START)
          # Add the element name. If the name includes a namespace we'll break
          # the name up into two separate tokens. The split is limited to two
          # parts: everything after the first colon is kept as the element
          # name (`a:b:c` results in the namespace `a` and the name `b:c`
          # instead of silently dropping `:c`) and a trailing colon results in
          # an empty name instead of `nil`.
          name = text(@ts + 1)
          if name.include?(':')
            ns, name = name.split(':', 2)
            add_token(:T_ELEM_NS, ns)
          end
          @elements << name
          add_token(:T_ELEM_NAME, name)
          fcall element_head;
        }
        element_name  = [a-zA-Z0-9\-_:]+;
        element_start = '<' element_name;
        # Machine used for processing the characters inside an element head. An
        # element head is everything between `<NAME` (where NAME is the element
        # name) and `>`.
        #
        # For example, in `<p foo="bar">` the element head is ` foo="bar"`.
        #
        element_head := |*
          whitespace | '=';
          newline => { advance_line };
          # Attribute names.
          element_name => { t(:T_ATTR) };
          # Attribute values.
          dquote => start_string_dquote;
          squote => start_string_squote;
          # The closing character of the open tag.
          ('>' | '/') => {
            fhold;
            fret;
          };
        *|;
        main := |*
          element_start  => start_element;
          doctype_start  => start_doctype;
          cdata_start    => start_cdata;
          comment_start  => start_comment;
          xml_decl_start => start_xml_decl;
          # Enter the body of the tag. If HTML mode is enabled and the current
          # element is a void element we'll close it and bail out.
          '>' => {
            if html? and HTML_VOID_ELEMENTS.include?(current_element)
              add_token(:T_ELEM_END, nil)
              @elements.pop
            end
          };
          # Regular closing tags.
          '</' element_name '>' => {
            emit_buffer
            add_token(:T_ELEM_END, nil)
            @elements.pop
          };
          # Self closing elements that are not handled by the HTML mode.
          '/>' => {
            add_token(:T_ELEM_END, nil)
            @elements.pop
          };
          # Note that this rule should be declared at the very bottom as it will
          # otherwise take precedence over the other rules.
          any => {
            # First character, start buffering (unless we already are buffering).
            start_buffer(@ts) unless buffering?
            # EOF, emit the text buffer.
            if @te == eof
              emit_buffer(@te)
            end
          };
        *|;
      }%%
    end # Lexer
  end # XML
 end # Oga
--- a/lib/oga/xml/parser.rb
+++ b/lib/oga/xml/parser.rb
@ -0,0 +1,402 @@
 #
 # DO NOT MODIFY!!!!
 # This file is automatically generated by Racc 1.4.11
 # from Racc grammer file "".
 #
 require 'racc/parser.rb'
 module Oga
  module XML
    class Parser < Racc::Parser
  ##
  # @param [Hash] options
  #
  # @option options [TrueClass|FalseClass] :html Enables HTML parsing mode.
  # @see Oga::XML::Lexer#initialize
  #
  def initialize(options = {})
    @lexer = Lexer.new(options)

    # Start from a known state. Without this @lines and @line stay nil until
    # the first #parse call has finished, so a node built before any token
    # supplies a line number (for example the :document node of an empty
    # input) gets a nil line on the first run but 1 on every later run.
    reset
  end
  ##
  # Resets the internal state of the parser.
  #
  def reset
    # Forget the source lines kept for error reporting.
    @lines = Array.new

    # Back to the first line; this assignment is also the return value.
    @line = 1
  end
  ##
  # Emits a new AST token.
  #
  # @param [Symbol] type
  # @param [Array] children
  #
  def s(type, *children)
    # Nested arrays of children are collapsed into one flat list.
    flattened = children.flatten

    # The node records the line of the most recently consumed token.
    AST::Node.new(type, flattened, :line => @line)
  end
  ##
  # Returns the next token from the lexer.
  #
  # @return [Array]
  #
  def next_token
    type, value, line = @tokens.shift

    # Keep the last known line when a token does not carry one.
    @line = line if line

    # A missing token (or one without a type) is reported as [false, false].
    if type
      [type, value]
    else
      [false, false]
    end
  end
  ##
  # @param [Fixnum] type The type of token the error occurred on.
  # @param [String] value The value of the token.
  # @param [Array] stack The current stack of parsed nodes.
  # @raise [Racc::ParseError]
  #
  def on_error(type, value, stack)
    name    = token_to_str(type)
    current = @line - 1

    # Collect up to 5 lines on either side of the failing line, skipping
    # offsets that fall outside the source.
    lines = (-5..5).each_with_object('') do |offset, buffer|
      source = @lines[current + offset]
      number = @line + offset

      # number > 0 rejects negative indexes, which Array#[] would otherwise
      # resolve from the end of the source.
      next unless source && number > 0

      # Only the failing line itself gets the arrow marker.
      marker = offset.zero? ? '=> ' : '   '

      buffer << "#{marker}#{number}: #{source.strip}\n"
    end

    raise Racc::ParseError, <<-EOF
 Unexpected #{name} with value #{value.inspect} on line #{@line}:
 #{lines}
    EOF
  end
  ##
  # Parses the supplied string and returns the AST.
  #
  # @example
  #  parser = Oga::XML::Parser.new
  #  ast    = parser.parse('<foo>bar</foo>')
  #
  # @param [String] string
  # @return [Oga::AST::Node]
  #
  def parse(string)
    # The raw lines are kept so on_error can show the source around a failure.
    @lines  = string.lines
    @tokens = @lexer.lex(string)

    return do_parse
  ensure
    # Always clear the per-run state, even when lexing or parsing raises
    # (on_error raises Racc::ParseError), so a failed run cannot leak its
    # source lines or line number into the next call on this instance.
    reset
  end
 # vim: set ft=racc:
 ##### State transition tables begin ###
 # Generated parser tables. They are opaque data for the Racc runtime and are
 # handed to it through Racc_arg further down; never edit them by hand.
 #
 # racc_action_table and racc_action_check are parallel arrays (50 entries
 # each in this build).
 racc_action_table = [
    16,    40,    16,    10,    24,    37,    11,    22,    12,    28,
    14,    23,    21,    45,    31,    15,    16,    10,    44,    28,
    11,    43,    12,    36,    14,    35,    16,    10,    34,    15,
    11,    41,    12,    42,    14,    33,    16,    10,    17,    15,
    11,    46,    12,   nil,    14,    29,    30,    19,    20,    15 ]
 racc_action_check = [
    15,    28,    38,    38,    12,    24,    38,    11,    38,    13,
    38,    12,    11,    38,    15,    38,     2,     2,    35,    26,
     2,    35,     2,    22,     2,    20,    25,    25,    20,     2,
    25,    30,    25,    32,    25,    17,     0,     0,     1,    25,
     0,    44,     0,   nil,     0,    14,    14,    10,    10,     0 ]
 # racc_action_pointer and racc_action_default hold 47 entries each, the same
 # number as racc_shift_n.
 racc_action_pointer = [
    33,    38,    13,   nil,   nil,   nil,   nil,   nil,   nil,   nil,
    42,     4,     1,    -6,    33,    -3,   nil,    35,   nil,   nil,
    23,   nil,    15,   nil,    -5,    23,     4,   nil,    -1,   nil,
    19,   nil,    16,   nil,   nil,    16,   nil,   nil,    -1,   nil,
   nil,   nil,   nil,   nil,    36,   nil,   nil ]
 racc_action_default = [
    -2,   -32,    -1,    -4,    -6,    -7,    -8,    -9,   -10,   -11,
   -32,   -32,   -32,   -24,   -32,   -32,   -31,   -32,    -3,   -12,
   -32,   -16,   -32,   -18,   -32,    -5,   -23,   -26,   -27,   -21,
   -32,   -29,   -32,    47,   -13,   -32,   -17,   -19,   -32,   -25,
   -28,   -22,   -30,   -14,   -32,   -20,   -15 ]
 # racc_goto_table and racc_goto_check are parallel arrays as well (37
 # entries each).
 racc_goto_table = [
    18,     2,    27,    32,    25,    26,     1,   nil,   nil,   nil,
   nil,   nil,   nil,   nil,   nil,    39,   nil,   nil,   nil,   nil,
   nil,   nil,   nil,   nil,   nil,   nil,    38,   nil,   nil,   nil,
   nil,   nil,   nil,   nil,   nil,   nil,    18 ]
 racc_goto_check = [
     3,     2,    13,     8,    11,    12,     1,   nil,   nil,   nil,
   nil,   nil,   nil,   nil,   nil,    13,   nil,   nil,   nil,   nil,
   nil,   nil,   nil,   nil,   nil,   nil,     2,   nil,   nil,   nil,
   nil,   nil,   nil,   nil,   nil,   nil,     3 ]
 # racc_goto_pointer and racc_goto_default hold 14 entries each, which matches
 # the number of non-terminal names listed in Racc_token_to_s_table.
 racc_goto_pointer = [
   nil,     6,     1,    -2,   nil,   nil,   nil,   nil,   -12,   nil,
   nil,    -9,    -8,   -11 ]
 racc_goto_default = [
   nil,   nil,   nil,     3,     4,     5,     6,     7,     8,     9,
    13,   nil,   nil,   nil ]
 # One triple per grammar rule, 32 triples in total (see racc_reduce_n).
 # NOTE(review): each triple looks like (number of symbols consumed, id of the
 # resulting non-terminal, name of the action method) - confirm against the
 # Racc runtime. Rules mapped to :_reduce_none share the pass-through action
 # that returns val[0].
 racc_reduce_table = [
  0, 0, :racc_error,
  1, 19, :_reduce_1,
  0, 19, :_reduce_2,
  2, 20, :_reduce_3,
  1, 20, :_reduce_4,
  0, 20, :_reduce_5,
  1, 21, :_reduce_none,
  1, 21, :_reduce_none,
  1, 21, :_reduce_none,
  1, 21, :_reduce_none,
  1, 21, :_reduce_none,
  1, 21, :_reduce_none,
  2, 22, :_reduce_12,
  3, 22, :_reduce_13,
  4, 22, :_reduce_14,
  5, 22, :_reduce_15,
  2, 23, :_reduce_16,
  3, 23, :_reduce_17,
  2, 24, :_reduce_18,
  3, 24, :_reduce_19,
  4, 25, :_reduce_20,
  2, 28, :_reduce_21,
  3, 28, :_reduce_22,
  1, 29, :_reduce_23,
  0, 29, :_reduce_24,
  2, 30, :_reduce_25,
  1, 30, :_reduce_26,
  1, 31, :_reduce_27,
  2, 31, :_reduce_28,
  2, 27, :_reduce_29,
  3, 27, :_reduce_30,
  1, 26, :_reduce_31 ]
 # Number of triples in racc_reduce_table.
 racc_reduce_n = 32
 # Equals the entry count of racc_action_pointer and racc_action_default.
 racc_shift_n = 47
 # Maps the token types returned by next_token to the terminal ids 0..17.
 # The order matches the first 18 names in Racc_token_to_s_table.
 racc_token_table = {
  false => 0,
  :error => 1,
  :T_STRING => 2,
  :T_TEXT => 3,
  :T_DOCTYPE_START => 4,
  :T_DOCTYPE_END => 5,
  :T_DOCTYPE_TYPE => 6,
  :T_CDATA_START => 7,
  :T_CDATA_END => 8,
  :T_COMMENT_START => 9,
  :T_COMMENT_END => 10,
  :T_ELEM_START => 11,
  :T_ELEM_NAME => 12,
  :T_ELEM_NS => 13,
  :T_ELEM_END => 14,
  :T_ATTR => 15,
  :T_XML_DECL_START => 16,
  :T_XML_DECL_END => 17 }
 # First id after the terminals above; index 18 of Racc_token_to_s_table is
 # "$start", the first non-terminal name.
 racc_nt_base = 18
 # NOTE(review): presumably the reason the _reduce_N actions take
 # (val, _values) and hand back their result as the return value - confirm.
 racc_use_result_var = false
 # Bundles every table defined above into the single array read by the Racc
 # runtime.
 # NOTE(review): the positional order is presumably dictated by Racc - do not
 # reorder.
 Racc_arg = [
  racc_action_table,
  racc_action_check,
  racc_action_default,
  racc_action_pointer,
  racc_goto_table,
  racc_goto_check,
  racc_goto_default,
  racc_goto_pointer,
  racc_nt_base,
  racc_reduce_table,
  racc_token_table,
  racc_shift_n,
  racc_reduce_n,
  racc_use_result_var ]
 # Display names indexed by symbol id: entries 0..17 are the terminals in the
 # same order as racc_token_table, entries 18 and up are the non-terminals of
 # the grammar.
 # NOTE(review): presumably what token_to_str (called from on_error) reads.
 Racc_token_to_s_table = [
  "$end",
  "error",
  "T_STRING",
  "T_TEXT",
  "T_DOCTYPE_START",
  "T_DOCTYPE_END",
  "T_DOCTYPE_TYPE",
  "T_CDATA_START",
  "T_CDATA_END",
  "T_COMMENT_START",
  "T_COMMENT_END",
  "T_ELEM_START",
  "T_ELEM_NAME",
  "T_ELEM_NS",
  "T_ELEM_END",
  "T_ATTR",
  "T_XML_DECL_START",
  "T_XML_DECL_END",
  "$start",
  "document",
  "expressions",
  "expression",
  "doctype",
  "cdata",
  "comment",
  "element",
  "text",
  "xmldecl",
  "element_open",
  "attributes",
  "attributes_",
  "attribute" ]
 # Debug flag emitted by the generator; left disabled here.
 Racc_debug_parser = false
 ##### State transition tables end #####
 # reduce 0 omitted
 # Builds a :document node from the first matched value.
 def _reduce_1(val, _values)
   s(:document, val.first)
 end
 # Builds a :document node without children.
 def _reduce_2(val, _values)
   s(:document)
 end
 # Returns the matched values with nil entries removed.
 def _reduce_3(val, _values)
   val.reject(&:nil?)
 end
 # Passes the first matched value through unchanged.
 def _reduce_4(val, _values)
   val.first
 end
 # Produces no value.
 def _reduce_5(val, _values)
   nil
 end
 # reduce 6 omitted
 # reduce 7 omitted
 # reduce 8 omitted
 # reduce 9 omitted
 # reduce 10 omitted
 # reduce 11 omitted
 # Builds a :doctype node without children.
 def _reduce_12(val, _values)
   s(:doctype)
 end
 # Builds a :doctype node from the second matched value.
 def _reduce_13(val, _values)
   s(:doctype, *val.values_at(1))
 end
 # Builds a :doctype node from the second and third matched values.
 def _reduce_14(val, _values)
   s(:doctype, *val.values_at(1, 2))
 end
 # Builds a :doctype node from the second, third and fourth matched values.
 def _reduce_15(val, _values)
   s(:doctype, *val.values_at(1, 2, 3))
 end
 # Builds a :cdata node without children.
 def _reduce_16(val, _values)
   s(:cdata)
 end
 # Builds a :cdata node from the second matched value.
 def _reduce_17(val, _values)
   _open, body = val

   s(:cdata, body)
 end
 # Builds a :comment node without children.
 def _reduce_18(val, _values)
   s(:comment)
 end
 # Builds a :comment node from the second matched value.
 def _reduce_19(val, _values)
   _open, body = val

   s(:comment, body)
 end
 # Builds an :element node from the first three matched values; the fourth
 # one is not used.
 def _reduce_20(val, _values)
   s(:element, *val.values_at(0, 1, 2))
 end
 # Returns a pair whose first slot is nil and whose second slot is the second
 # matched value.
 def _reduce_21(val, _values)
   [nil, val.at(1)]
 end
 # Returns the second and third matched values as a pair.
 def _reduce_22(val, _values)
   val.values_at(1, 2)
 end
 # Wraps the first matched value in an :attributes node.
 def _reduce_23(val, _values)
   s(:attributes, val.first)
 end
 # Produces no value.
 def _reduce_24(val, _values)
   nil
 end
 # Returns the matched values unchanged.
 def _reduce_25(val, _values)
   val
 end
 # Returns the matched values unchanged.
 def _reduce_26(val, _values)
   val
 end
 # Builds an :attribute node from the first matched value.
 def _reduce_27(val, _values)
   s(:attribute, val.first)
 end
 # Builds an :attribute node from the first two matched values.
 def _reduce_28(val, _values)
   s(:attribute, *val.values_at(0, 1))
 end
 # Builds an :xml_decl node without children.
 def _reduce_29(val, _values)
   s(:xml_decl)
 end
 # Builds an :xml_decl node from the second matched value.
 def _reduce_30(val, _values)
   s(:xml_decl, val.at(1))
 end
 # Builds a :text node from the first matched value.
 def _reduce_31(val, _values)
   s(:text, val.first)
 end
 # Shared pass-through action: returns the first matched value.
 def _reduce_none(val, _values)
   val.first
 end
    end   # class Parser
    end   # module XML
  end   # module Oga
--- a/lib/oga/xml/parser.y
+++ b/lib/oga/xml/parser.y
@ -5,9 +5,9 @@
 # It requires every tag to have a closing tag. As such you'll need to enable
 # HTML parsing mode when parsing HTML. This can be done as following:
 #
-#     parser = Oga::Parser.new(:html => true)
+#     parser = Oga::XML::Parser.new(:html => true)
 #
-class Oga::Parser
+class Oga::XML::Parser
 token T_STRING T_TEXT
 token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE
--- a/spec/oga/lexer/cdata_spec.rb
+++ b/spec/oga/lexer/cdata_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'
-describe Oga::Lexer do
+describe Oga::XML::Lexer do
  context 'cdata tags' do
    example 'lex a cdata tag' do
      lex('<![CDATA[foo]]>').should == [
--- a/spec/oga/lexer/comments_spec.rb
+++ b/spec/oga/lexer/comments_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'
-describe Oga::Lexer do
+describe Oga::XML::Lexer do
  context 'comments' do
    example 'lex a comment' do
      lex('<!-- foo -->').should == [
--- a/spec/oga/lexer/doctype_spec.rb
+++ b/spec/oga/lexer/doctype_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'
-describe Oga::Lexer do
+describe Oga::XML::Lexer do
  context 'doctypes' do
    example 'lex the HTML5 doctype' do
      lex('<!DOCTYPE html>').should == [
--- a/spec/oga/lexer/documents_spec.rb
+++ b/spec/oga/lexer/documents_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'
-describe Oga::Lexer do
+describe Oga::XML::Lexer do
  context 'HTML documents' do
    example 'lex a basic HTML document' do
      html = <<-EOF
--- a/spec/oga/lexer/elements_spec.rb
+++ b/spec/oga/lexer/elements_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'
-describe Oga::Lexer do
+describe Oga::XML::Lexer do
  context 'elements' do
    example 'lex an opening element' do
      lex('<p>').should == [
--- a/spec/oga/lexer/general_spec.rb
+++ b/spec/oga/lexer/general_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'
-describe Oga::Lexer do
+describe Oga::XML::Lexer do
  context 'regular text' do
    example 'lex regular text' do
      lex('hello').should == [[:T_TEXT, 'hello', 1]]
--- a/spec/oga/lexer/html_void_elements_spec.rb
+++ b/spec/oga/lexer/html_void_elements_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'
-describe Oga::Lexer do
+describe Oga::XML::Lexer do
  context 'HTML void elements' do
    example 'lex a void element that omits the closing /' do
      lex('<link>', :html => true).should == [
--- a/spec/oga/lexer/xml_declaration_spec.rb
+++ b/spec/oga/lexer/xml_declaration_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'
-describe Oga::Lexer do
+describe Oga::XML::Lexer do
  context 'XML declaration tags' do
    example 'lex a start tag' do
      lex('<?xml').should == [[:T_XML_DECL_START, nil, 1]]
--- a/spec/oga/parser/cdata_spec.rb
+++ b/spec/oga/parser/cdata_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'
-describe Oga::Parser do
+describe Oga::XML::Parser do
  context 'cdata tags' do
    example 'parse a cdata tag' do
      parse('<![CDATA[foo]]>').should == s(:document, s(:cdata, 'foo'))
--- a/spec/oga/parser/comments_spec.rb
+++ b/spec/oga/parser/comments_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'
-describe Oga::Parser do
+describe Oga::XML::Parser do
  context 'comments' do
    example 'parse an empty comment' do
      parse('<!---->').should == s(:document, s(:comment))
--- a/spec/oga/parser/doctype_spec.rb
+++ b/spec/oga/parser/doctype_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'
-describe Oga::Parser do
+describe Oga::XML::Parser do
  context 'doctypes' do
    example 'parse a doctype' do
      parse('<!DOCTYPE html>').should == s(:document, s(:doctype))
--- a/spec/oga/parser/documents_spec.rb
+++ b/spec/oga/parser/documents_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'
-describe Oga::Parser do
+describe Oga::XML::Parser do
  context 'HTML documents' do
    example 'parse a basic HTML document' do
      html = <<-EOF
--- a/spec/oga/parser/elements_spec.rb
+++ b/spec/oga/parser/elements_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'
-describe Oga::Parser do
+describe Oga::XML::Parser do
  context 'elements' do
    example 'parse an empty element' do
      parse('<p></p>').should == s(
--- a/spec/oga/parser/general_spec.rb
+++ b/spec/oga/parser/general_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'
-describe Oga::Parser do
+describe Oga::XML::Parser do
  example 'parse regular text' do
    parse('foo').should == s(:document, s(:text, 'foo'))
  end
--- a/spec/oga/parser/html_void_elements_spec.rb
+++ b/spec/oga/parser/html_void_elements_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'
-describe Oga::Parser do
+describe Oga::XML::Parser do
  context 'HTML void elements' do
    example 'parse a void element that omits the closing /' do
      parse('<link>', :html => true).should == s(
--- a/spec/oga/parser/xml_declaration_spec.rb
+++ b/spec/oga/parser/xml_declaration_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'
-describe Oga::Parser do
+describe Oga::XML::Parser do
  context 'XML declaration tags' do
    example 'lex an XML declaration tag' do
      parse('<?xml hello ?>').should == s(
--- a/spec/support/parsing.rb
+++ b/spec/support/parsing.rb
@ -19,7 +19,7 @@ module Oga
    # @return [Array]
    #
    def lex(input, options = {})
-      return Oga::Lexer.new(options).lex(input)
+      return Oga::XML::Lexer.new(options).lex(input)
    end
    ##
@ -30,7 +30,7 @@ module Oga
    # @return [Oga::AST::Node]
    #
    def parse(input, options = {})
-      return Oga::Parser.new(options).parse(input)
+      return Oga::XML::Parser.new(options).parse(input)
    end
  end # ParsingHelpers
 end # Oga