Fix for lexing newlines in doctypes
This also ensures that newlines are advanced properly. Fixes #95
This commit is contained in:
parent
a08829add5
commit
9a0e31d0ae
|
@ -47,14 +47,19 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
newline = '\r\n' | '\n' | '\r';
|
newline = '\r\n' | '\n' | '\r';
|
||||||
|
whitespace = [ \t];
|
||||||
|
ident_char = [a-zA-Z0-9\-_];
|
||||||
|
identifier = ident_char+;
|
||||||
|
|
||||||
|
whitespace_or_newline = whitespace | newline;
|
||||||
|
|
||||||
action count_newlines {
|
action count_newlines {
|
||||||
if ( fc == '\n' ) lines++;
|
if ( fc == '\n' ) lines++;
|
||||||
}
|
}
|
||||||
|
|
||||||
whitespace = [ \t];
|
action advance_newline {
|
||||||
ident_char = [a-zA-Z0-9\-_];
|
advance_line(1)
|
||||||
identifier = ident_char+;
|
}
|
||||||
|
|
||||||
# Comments
|
# Comments
|
||||||
#
|
#
|
||||||
|
@ -240,10 +245,18 @@
|
||||||
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
|
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
|
||||||
# 3. Legacy doctypes
|
# 3. Legacy doctypes
|
||||||
#
|
#
|
||||||
doctype_start = '<!DOCTYPE'i whitespace+;
|
doctype_start = '<!DOCTYPE'i (whitespace_or_newline+ $count_newlines);
|
||||||
|
|
||||||
action start_doctype {
|
action start_doctype {
|
||||||
callback_simple(id_on_doctype_start);
|
callback_simple(id_on_doctype_start);
|
||||||
|
|
||||||
|
if ( lines > 0 )
|
||||||
|
{
|
||||||
|
advance_line(lines);
|
||||||
|
|
||||||
|
lines = 0;
|
||||||
|
}
|
||||||
|
|
||||||
fnext doctype;
|
fnext doctype;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -277,10 +290,6 @@
|
||||||
squote => start_string_squote;
|
squote => start_string_squote;
|
||||||
dquote => start_string_dquote;
|
dquote => start_string_dquote;
|
||||||
|
|
||||||
# Whitespace inside doctypes is ignored since there's no point in
|
|
||||||
# including it.
|
|
||||||
whitespace;
|
|
||||||
|
|
||||||
identifier => {
|
identifier => {
|
||||||
callback(id_on_doctype_name, data, encoding, ts, te);
|
callback(id_on_doctype_name, data, encoding, ts, te);
|
||||||
};
|
};
|
||||||
|
@ -289,6 +298,10 @@
|
||||||
callback_simple(id_on_doctype_end);
|
callback_simple(id_on_doctype_end);
|
||||||
fnext main;
|
fnext main;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
newline => advance_newline;
|
||||||
|
|
||||||
|
whitespace;
|
||||||
*|;
|
*|;
|
||||||
|
|
||||||
# XML declaration tags
|
# XML declaration tags
|
||||||
|
@ -379,7 +392,7 @@
|
||||||
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
|
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
|
||||||
# for more info.
|
# for more info.
|
||||||
html_unquoted_value = ^(
|
html_unquoted_value = ^(
|
||||||
squote | dquote | '`' | '=' | '<' | '>' | whitespace | newline
|
squote | dquote | '`' | '=' | '<' | '>' | whitespace_or_newline
|
||||||
)+;
|
)+;
|
||||||
|
|
||||||
# Machine used for processing HTML attribute values.
|
# Machine used for processing HTML attribute values.
|
||||||
|
@ -414,9 +427,7 @@
|
||||||
element_head := |*
|
element_head := |*
|
||||||
whitespace;
|
whitespace;
|
||||||
|
|
||||||
newline => {
|
newline => advance_newline;
|
||||||
callback_simple(id_advance_line);
|
|
||||||
};
|
|
||||||
|
|
||||||
# Attribute names and namespaces.
|
# Attribute names and namespaces.
|
||||||
identifier ':' => {
|
identifier ':' => {
|
||||||
|
|
|
@ -10,6 +10,23 @@ describe Oga::XML::Lexer do
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it 'lexes a doctype containing a newline before the doctype name' do
|
||||||
|
lex("<!DOCTYPE\nhtml>").should == [
|
||||||
|
[:T_DOCTYPE_START, nil, 1],
|
||||||
|
[:T_DOCTYPE_NAME, 'html', 2],
|
||||||
|
[:T_DOCTYPE_END, nil, 2]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'lexes a doctype with a public ID preceded by a newline' do
|
||||||
|
lex("<!DOCTYPE html\nPUBLIC>").should == [
|
||||||
|
[:T_DOCTYPE_START, nil, 1],
|
||||||
|
[:T_DOCTYPE_NAME, 'html', 1],
|
||||||
|
[:T_DOCTYPE_TYPE, 'PUBLIC', 2],
|
||||||
|
[:T_DOCTYPE_END, nil, 2]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
it 'lexes a doctype with a public and system ID' do
|
it 'lexes a doctype with a public and system ID' do
|
||||||
lex('<!DOCTYPE HTML PUBLIC "foobar" "baz">').should == [
|
lex('<!DOCTYPE HTML PUBLIC "foobar" "baz">').should == [
|
||||||
[:T_DOCTYPE_START, nil, 1],
|
[:T_DOCTYPE_START, nil, 1],
|
||||||
|
|
Loading…
Reference in New Issue