Fix for lexing newlines in doctypes

This also ensures that line numbers are advanced properly when newlines are encountered. Fixes #95
2015-04-15 20:22:14 +02:00 · 2015-04-15 20:22:14 +02:00 · 9a0e31d0ae
parent a08829add5
commit 9a0e31d0ae
2 changed files with 41 additions and 13 deletions
--- a/ext/ragel/base_lexer.rl
+++ b/ext/ragel/base_lexer.rl
@ -47,14 +47,19 @@
    #
    newline    = '\r\n' | '\n' | '\r';
    whitespace = [ \t];
    ident_char = [a-zA-Z0-9\-_];
    identifier = ident_char+;
    whitespace_or_newline = whitespace | newline;
    # Increments the `lines` counter once for every '\n' character the action
    # is run on.
    # NOTE(review): `newline` also accepts a lone '\r', which this action does
    # not count — confirm that is intended.
    action count_newlines {
        if ( fc == '\n' ) lines++;
    }
-    whitespace = [ \t];
+    # Advances the line counter by exactly one; intended for rules that match
+    # a single `newline`.
+    action advance_newline {
-    ident_char = [a-zA-Z0-9\-_];
+        advance_line(1);
-    identifier = ident_char+;
+    }
    # Comments
    #
@ -240,10 +245,18 @@
    # 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
    # 3. Legacy doctypes
    #
-    doctype_start = '<!DOCTYPE'i whitespace+;
+    doctype_start = '<!DOCTYPE'i (whitespace_or_newline+ $count_newlines);
    # Fires the doctype start callback, then passes any pending `lines` count
    # to advance_line() and resets it before switching to the `doctype`
    # machine. The callback runs before the line advance; presumably this
    # keeps the start token on the line where the doctype began — confirm
    # against the callback implementation.
    action start_doctype {
        callback_simple(id_on_doctype_start);
        if ( lines > 0 )
        {
            advance_line(lines);
            lines = 0;
        }
        fnext doctype;
    }
@ -277,10 +290,6 @@
        squote => start_string_squote;
        dquote => start_string_dquote;
        # Whitespace inside doctypes is ignored since there's no point in
        # including it.
        whitespace;
        identifier => {
            callback(id_on_doctype_name, data, encoding, ts, te);
        };
@ -289,6 +298,10 @@
            callback_simple(id_on_doctype_end);
            fnext main;
        };
        newline => advance_newline;
        whitespace;
    *|;
    # XML declaration tags
@ -379,7 +392,7 @@
    # See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
    # for more info.
    html_unquoted_value = ^(
-        squote | dquote | '`' | '=' | '<' | '>' | whitespace | newline
+        squote | dquote | '`' | '=' | '<' | '>' | whitespace_or_newline
    )+;
    # Machine used for processing HTML attribute values.
@ -414,9 +427,7 @@
    element_head := |*
        whitespace;
-        newline => {
+        newline => advance_newline;
-            callback_simple(id_advance_line);
-        };
        # Attribute names and namespaces.
        identifier ':' => {
--- a/spec/oga/xml/lexer/doctype_spec.rb
+++ b/spec/oga/xml/lexer/doctype_spec.rb
@ -10,6 +10,23 @@ describe Oga::XML::Lexer do
      ]
    end
    # The newline directly follows `<!DOCTYPE`, so only the start token is
    # expected on line 1; the name and end tokens are expected on line 2.
    it 'lexes a doctype containing a newline before the doctype name' do
      lex("<!DOCTYPE\nhtml>").should == [
        [:T_DOCTYPE_START, nil, 1],
        [:T_DOCTYPE_NAME, 'html', 2],
        [:T_DOCTYPE_END, nil, 2]
      ]
    end
    # The newline comes after the doctype name, so the expected line number
    # changes between T_DOCTYPE_NAME (line 1) and T_DOCTYPE_TYPE (line 2).
    it 'lexes a doctype with a public ID preceded by a newline' do
      lex("<!DOCTYPE html\nPUBLIC>").should == [
        [:T_DOCTYPE_START, nil, 1],
        [:T_DOCTYPE_NAME, 'html', 1],
        [:T_DOCTYPE_TYPE, 'PUBLIC', 2],
        [:T_DOCTYPE_END, nil, 2]
      ]
    end
    it 'lexes a doctype with a public and system ID' do
      lex('<!DOCTYPE HTML PUBLIC "foobar" "baz">').should == [
        [:T_DOCTYPE_START, nil, 1],