Fixed lexing of XML comments.

The previous setup would consume too much. For example the following HTML: <a><!--foo--><b><!--bar--></b></a> would result in the following T_COMMENT token: "foo--><b><!--bar". The new setup requires the marking of a start position. I'm not a huge fan of this but there doesn't appear to be a way around this.
2014-08-15 20:36:40 +02:00 · 2014-08-15 20:36:40 +02:00 · 81edce2eb8
parent 4d7f224892
commit 81edce2eb8
4 changed files with 47 additions and 15 deletions
--- a/ext/c/lexer.rl
+++ b/ext/c/lexer.rl
@ -77,11 +77,12 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)

    char *data_str_val = StringValuePtr(data_block);

-    const char *p   = data_str_val;
-    const char *pe  = data_str_val + strlen(data_str_val);
-    const char *eof = pe;
-    const char *ts  = 0;
-    const char *te  = 0;
+    const char *p    = data_str_val;
+    const char *pe   = data_str_val + strlen(data_str_val);
+    const char *eof  = pe;
+    const char *ts   = 0;
+    const char *te   = 0;
+    const char *mark = 0;

    int act = NUM2INT(oga_ivar_get(self, "@act"));
    int cs  = NUM2INT(oga_ivar_get(self, "@cs"));
--- a/ext/java/org/liboga/xml/Lexer.rl
+++ b/ext/java/org/liboga/xml/Lexer.rl
@ -90,11 +90,12 @@ public class Lexer extends RubyObject

        byte[] data = rb_str.getBytes();

-        int ts  = 0;
-        int te  = 0;
-        int p   = 0;
-        int pe  = data.length;
-        int eof = data.length;
+        int ts   = 0;
+        int te   = 0;
+        int p    = 0;
+        int mark = 0;
+        int pe   = data.length;
+        int eof  = data.length;

        %% write exec;

--- a/ext/ragel/base_lexer.rl
+++ b/ext/ragel/base_lexer.rl
@ -47,7 +47,27 @@
    # such as `--` and `->`. Putting extra checks in for these sequences would
    # actually make the rules/actions more complex.
    #
-    comment = '<!--' any* '-->';
+
+    comment_start = '<!--';
+    comment_end   = '-->';
+
+    action start_comment {
+        mark = ts + 4;
+
+        fnext comment_body;
+    }
+
+    comment_body := |*
+        comment_end => {
+            callback("on_comment", data, encoding, mark, te - 3);
+
+            mark = 0;
+
+            fnext main;
+        };
+
+        any;
+    *|;

    # CDATA
    #
@ -215,10 +235,7 @@
    main := |*
        doctype_start  => start_doctype;
        xml_decl_start => start_xml_decl;
-
-        comment => {
-            callback("on_comment", data, encoding, ts + 4, te - 3);
-        };
+        comment_start  => start_comment;

        cdata => {
            callback("on_cdata", data, encoding, ts + 9, te - 3);
--- a/spec/oga/xml/lexer/comments_spec.rb
+++ b/spec/oga/xml/lexer/comments_spec.rb
@ -36,5 +36,18 @@ describe Oga::XML::Lexer do
        [:T_COMMENT, '', 1]
      ]
    end
+
+    example 'lex two comments following each other' do
+      lex('<a><!--foo--><b><!--bar--></b></a>').should == [
+        [:T_ELEM_START, nil, 1],
+        [:T_ELEM_NAME, 'a', 1],
+        [:T_COMMENT, 'foo', 1],
+        [:T_ELEM_START, nil, 1],
+        [:T_ELEM_NAME, 'b', 1],
+        [:T_COMMENT, 'bar', 1],
+        [:T_ELEM_END, nil, 1],
+        [:T_ELEM_END, nil, 1]
+      ]
+    end
  end
 end