Removed the buffering crap from the lexer.

2014-05-04 17:39:08 +02:00 · 2014-05-04 17:39:08 +02:00 · f18e8893de
parent 57255012b7
commit f18e8893de
1 changed files with 25 additions and 67 deletions
--- a/lib/oga/xml/lexer.rl
+++ b/lib/oga/xml/lexer.rl
@ -216,26 +216,21 @@ module Oga
      end
      ##
-      # Emits the current buffer if we have any. The current line number is
+      # Emits a text token.
      # advanced based on the amount of newlines in the buffer.
      #
-      # @param [Fixnum] position The end position of the buffer.
+      # @param [Fixnum] start
-      # @param [Symbol] type The type of node to emit.
+      # @param [Fixnum] stop
      #
-      def emit_buffer(position, type = :T_TEXT)
+      def emit_text(start, stop)
-        return unless @buffer_start_position
+        content = text(start, stop)
        content = text(@buffer_start_position, position)
        unless content.empty?
-          add_token(type, content)
+          add_token(:T_TEXT, content)
          lines = content.count("\n")
          advance_line(lines) if lines > 0
        end
        @buffer_start_position = nil
      end
      ##
@ -262,36 +257,22 @@ module Oga
        dquote = '"';
        squote = "'";
        action start_string_dquote {
          start_buffer(te)
          fcall string_dquote;
        }
        action start_string_squote {
          start_buffer(te)
          fcall string_squote;
        }
        # Machine for processing double quoted strings.
        string_dquote := |*
-          dquote => {
+           ^dquote+ => {
-            emit_buffer(ts, :T_STRING)
+             emit(:T_STRING, ts, te)
            fret;
           };
-          any;
+           dquote => { fret; };
        *|;
        # Machine for processing single quoted strings.
        string_squote := |*
-          squote => {
+          ^squote+ => {
-            emit_buffer(ts, :T_STRING)
+            emit(:T_STRING, ts, te)
            fret;
          };
-          any;
+          squote => { fret; };
        *|;
        # DOCTYPES
@ -307,7 +288,6 @@ module Oga
        doctype_start = '<!DOCTYPE'i whitespace+;
        action start_doctype {
          emit_buffer(ts)
          add_token(:T_DOCTYPE_START)
          fcall doctype;
        }
@ -318,8 +298,8 @@ module Oga
          'PUBLIC' | 'SYSTEM' => { emit(:T_DOCTYPE_TYPE, ts, te) };
          # Lex the public/system IDs as regular strings.
-          dquote => start_string_dquote;
+          dquote => { fcall string_dquote; };
-          squote => start_string_squote;
+          squote => { fcall string_squote; };
          # Whitespace inside doctypes is ignored since there's no point in
          # including it.
@ -347,25 +327,20 @@ module Oga
        cdata_end   = ']]>';
        action start_cdata {
          emit_buffer(ts)
          add_token(:T_CDATA_START)
          start_buffer(te)
          fcall cdata;
        }
        # Machine for processing the contents of CDATA tags. Everything
        # inside a CDATA tag is treated as plain text.
        cdata := |*
-          cdata_end => {
+          any* cdata_end => {
-            emit_buffer(ts)
+            emit_text(ts, te - 3)
            add_token(:T_CDATA_END)
            fret;
          };
          any;
        *|;
        # Comments
@ -383,25 +358,20 @@ module Oga
        comment_end   = '-->';
        action start_comment {
          emit_buffer(ts)
          add_token(:T_COMMENT_START)
          start_buffer(te)
          fcall comment;
        }
        # Machine used for processing the contents of a comment. Everything
        # inside a comment is treated as plain text (similar to CDATA tags).
        comment := |*
-          comment_end => {
+          any* comment_end => {
-            emit_buffer(ts)
+            emit_text(ts, te - 3)
            add_token(:T_COMMENT_END)
            fret;
          };
          any;
        *|;
        # XML declaration tags
@ -412,18 +382,14 @@ module Oga
        xml_decl_end   = '?>';
        action start_xml_decl {
          emit_buffer(ts)
          add_token(:T_XML_DECL_START)
          start_buffer(te)
          fcall xml_decl;
        }
        # Machine that processes the contents of an XML declaration tag.
        xml_decl := |*
          xml_decl_end => {
            emit_buffer(ts)
            add_token(:T_XML_DECL_END)
            fret;
@ -432,8 +398,8 @@ module Oga
          # Attributes and their values (e.g. version="1.0").
          identifier => { emit(:T_ATTR, ts, te) };
-          dquote => start_string_dquote;
+          dquote => { fcall string_dquote; };
-          squote => start_string_squote;
+          squote => { fcall string_squote; };
          any;
        *|;
@ -447,7 +413,6 @@ module Oga
        # namespace (if any). Remaining work is delegated to a dedicated
        # machine.
        action start_element {
          emit_buffer(ts)
          add_token(:T_ELEM_START)
          # Add the element name. If the name includes a namespace we'll break
@ -484,8 +449,8 @@ module Oga
          identifier => { emit(:T_ATTR, ts, te) };
          # Attribute values.
-          dquote => start_string_dquote;
+          dquote => { fcall string_dquote; };
-          squote => start_string_squote;
+          squote => { fcall string_squote; };
          # The closing character of the open tag.
          ('>' | '/') => {
@ -512,7 +477,6 @@ module Oga
          # Regular closing tags.
          '</' identifier '>' => {
            emit_buffer(ts)
            add_token(:T_ELEM_END, nil)
            @elements.pop if html?
@ -527,14 +491,8 @@ module Oga
          # Note that this rule should be declared at the very bottom as it
          # will otherwise take precedence over the other rules.
-          any => {
+          ^('<' | '>')+ => {
-            # First character, start buffering (unless we already are buffering).
+            emit_text(ts, te)
            start_buffer(ts) unless @buffer_start_position
            # EOF, emit the text buffer.
            if te == eof
              emit_buffer(te)
            end
          };
        *|;
      }%%