Fix for lexing newlines in doctypes

This also ensures that the lexer's line number is advanced properly whenever such a newline is encountered.

Fixes #95
This commit is contained in:
Yorick Peterse 2015-04-15 20:22:14 +02:00
parent a08829add5
commit 9a0e31d0ae
2 changed files with 41 additions and 13 deletions

View File

@ -47,14 +47,19 @@
# #
newline = '\r\n' | '\n' | '\r'; newline = '\r\n' | '\n' | '\r';
whitespace = [ \t];
ident_char = [a-zA-Z0-9\-_];
identifier = ident_char+;
whitespace_or_newline = whitespace | newline;
action count_newlines { action count_newlines {
if ( fc == '\n' ) lines++; if ( fc == '\n' ) lines++;
} }
whitespace = [ \t]; action advance_newline {
ident_char = [a-zA-Z0-9\-_]; advance_line(1)
identifier = ident_char+; }
# Comments # Comments
# #
@ -240,10 +245,18 @@
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5. # 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
# 3. Legacy doctypes # 3. Legacy doctypes
# #
doctype_start = '<!DOCTYPE'i whitespace+; doctype_start = '<!DOCTYPE'i (whitespace_or_newline+ $count_newlines);
action start_doctype { action start_doctype {
callback_simple(id_on_doctype_start); callback_simple(id_on_doctype_start);
if ( lines > 0 )
{
advance_line(lines);
lines = 0;
}
fnext doctype; fnext doctype;
} }
@ -277,10 +290,6 @@
squote => start_string_squote; squote => start_string_squote;
dquote => start_string_dquote; dquote => start_string_dquote;
# Whitespace inside doctypes is ignored since there's no point in
# including it.
whitespace;
identifier => { identifier => {
callback(id_on_doctype_name, data, encoding, ts, te); callback(id_on_doctype_name, data, encoding, ts, te);
}; };
@ -289,6 +298,10 @@
callback_simple(id_on_doctype_end); callback_simple(id_on_doctype_end);
fnext main; fnext main;
}; };
newline => advance_newline;
whitespace;
*|; *|;
# XML declaration tags # XML declaration tags
@ -379,7 +392,7 @@
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example # See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
# for more info. # for more info.
html_unquoted_value = ^( html_unquoted_value = ^(
squote | dquote | '`' | '=' | '<' | '>' | whitespace | newline squote | dquote | '`' | '=' | '<' | '>' | whitespace_or_newline
)+; )+;
# Machine used for processing HTML attribute values. # Machine used for processing HTML attribute values.
@ -414,9 +427,7 @@
element_head := |* element_head := |*
whitespace; whitespace;
newline => { newline => advance_newline;
callback_simple(id_advance_line);
};
# Attribute names and namespaces. # Attribute names and namespaces.
identifier ':' => { identifier ':' => {

View File

@ -10,6 +10,23 @@ describe Oga::XML::Lexer do
] ]
end end
it 'lexes a doctype containing a newline before the doctype name' do
lex("<!DOCTYPE\nhtml>").should == [
[:T_DOCTYPE_START, nil, 1],
[:T_DOCTYPE_NAME, 'html', 2],
[:T_DOCTYPE_END, nil, 2]
]
end
it 'lexes a doctype with a public ID preceded by a newline' do
lex("<!DOCTYPE html\nPUBLIC>").should == [
[:T_DOCTYPE_START, nil, 1],
[:T_DOCTYPE_NAME, 'html', 1],
[:T_DOCTYPE_TYPE, 'PUBLIC', 2],
[:T_DOCTYPE_END, nil, 2]
]
end
it 'lexes a doctype with a public and system ID' do it 'lexes a doctype with a public and system ID' do
lex('<!DOCTYPE HTML PUBLIC "foobar" "baz">').should == [ lex('<!DOCTYPE HTML PUBLIC "foobar" "baz">').should == [
[:T_DOCTYPE_START, nil, 1], [:T_DOCTYPE_START, nil, 1],