diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index fe12a53..dca0eff 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -46,15 +46,20 @@ # stack. # - newline = '\r\n' | '\n' | '\r'; + newline = '\r\n' | '\n' | '\r'; + whitespace = [ \t]; + ident_char = [a-zA-Z0-9\-_]; + identifier = ident_char+; + + whitespace_or_newline = whitespace | newline; action count_newlines { if ( fc == '\n' ) lines++; } - whitespace = [ \t]; - ident_char = [a-zA-Z0-9\-_]; - identifier = ident_char+; + action advance_newline { + advance_line(1) + } # Comments # @@ -240,10 +245,18 @@ # 2. Deprecated doctypes, the more verbose ones used prior to HTML5. # 3. Legacy doctypes # - doctype_start = ' 0 ) + { + advance_line(lines); + + lines = 0; + } + fnext doctype; } @@ -277,10 +290,6 @@ squote => start_string_squote; dquote => start_string_dquote; - # Whitespace inside doctypes is ignored since there's no point in - # including it. - whitespace; - identifier => { callback(id_on_doctype_name, data, encoding, ts, te); }; @@ -289,6 +298,10 @@ callback_simple(id_on_doctype_end); fnext main; }; + + newline => advance_newline; + + whitespace; *|; # XML declaration tags @@ -379,7 +392,7 @@ # See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example # for more info. html_unquoted_value = ^( - squote | dquote | '`' | '=' | '<' | '>' | whitespace | newline + squote | dquote | '`' | '=' | '<' | '>' | whitespace_or_newline )+; # Machine used for processing HTML attribute values. @@ -414,9 +427,7 @@ element_head := |* whitespace; - newline => { - callback_simple(id_advance_line); - }; + newline => advance_newline; # Attribute names and namespaces. identifier ':' => { diff --git a/spec/oga/xml/lexer/doctype_spec.rb b/spec/oga/xml/lexer/doctype_spec.rb index 3749b37..66d2d20 100644 --- a/spec/oga/xml/lexer/doctype_spec.rb +++ b/spec/oga/xml/lexer/doctype_spec.rb @@ -10,6 +10,23 @@ describe Oga::XML::Lexer do ] end + it 'lexes a doctype containing a newline before the doctype name' do + lex("").should == [ + [:T_DOCTYPE_START, nil, 1], + [:T_DOCTYPE_NAME, 'html', 2], + [:T_DOCTYPE_END, nil, 2] + ] + end + + it 'lexes a doctype with a public ID preceded by a newline' do + lex("").should == [ + [:T_DOCTYPE_START, nil, 1], + [:T_DOCTYPE_NAME, 'html', 1], + [:T_DOCTYPE_TYPE, 'PUBLIC', 2], + [:T_DOCTYPE_END, nil, 2] + ] + end + it 'lexes a doctype with a public and system ID' do lex('').should == [ [:T_DOCTYPE_START, nil, 1],