Support for inline doctype rules that contain newlines.
This adds support for lexing and parsing XML documents that are read from an IO _and_ contain inline doctype rules with newlines in them. This fixes #63.
This commit is contained in:
parent
f88df486ba
commit
cbb2815146
|
@ -183,6 +183,22 @@
|
||||||
fnext doctype;
|
fnext doctype;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Machine for processing inline rules of a doctype.
|
||||||
|
doctype_inline := |*
|
||||||
|
^']'* $count_newlines => {
|
||||||
|
callback("on_doctype_inline", data, encoding, ts, te);
|
||||||
|
|
||||||
|
if ( lines > 0 )
|
||||||
|
{
|
||||||
|
advance_line(lines);
|
||||||
|
|
||||||
|
lines = 0;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
']' => { fnext doctype; };
|
||||||
|
*|;
|
||||||
|
|
||||||
# Machine for processing doctypes. Doctype values such as the public
|
# Machine for processing doctypes. Doctype values such as the public
|
||||||
# and system IDs are treated as T_STRING tokens.
|
# and system IDs are treated as T_STRING tokens.
|
||||||
doctype := |*
|
doctype := |*
|
||||||
|
@ -190,11 +206,8 @@
|
||||||
callback("on_doctype_type", data, encoding, ts, te);
|
callback("on_doctype_type", data, encoding, ts, te);
|
||||||
};
|
};
|
||||||
|
|
||||||
# Consumes everything between the [ and ]. Due to the use of :> the ]
|
# Starts a set of inline doctype rules.
|
||||||
# is not consumed by any+.
|
'[' => { fnext doctype_inline; };
|
||||||
'[' any+ :> ']' => {
|
|
||||||
callback("on_doctype_inline", data, encoding, ts + 1, te - 1);
|
|
||||||
};
|
|
||||||
|
|
||||||
# Lex the public/system IDs as regular strings.
|
# Lex the public/system IDs as regular strings.
|
||||||
squote => start_string_squote;
|
squote => start_string_squote;
|
||||||
|
|
|
@ -86,12 +86,17 @@ rule
|
||||||
}
|
}
|
||||||
|
|
||||||
# <!DOCTYPE html [ ... ]>
|
# <!DOCTYPE html [ ... ]>
|
||||||
| T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_INLINE T_DOCTYPE_END
|
| T_DOCTYPE_START T_DOCTYPE_NAME doctype_inline T_DOCTYPE_END
|
||||||
{
|
{
|
||||||
on_doctype(:name => val[1], :inline_rules => val[2])
|
on_doctype(:name => val[1], :inline_rules => val[2])
|
||||||
}
|
}
|
||||||
;
|
;
|
||||||
|
|
||||||
|
doctype_inline
|
||||||
|
: T_DOCTYPE_INLINE { val[0] }
|
||||||
|
| doctype_inline T_DOCTYPE_INLINE { val[0] + val[1] }
|
||||||
|
;
|
||||||
|
|
||||||
# CDATA tags
|
# CDATA tags
|
||||||
|
|
||||||
cdata
|
cdata
|
||||||
|
|
|
@ -49,6 +49,46 @@ describe Oga::XML::Lexer do
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
example 'lex an empty inline doctype' do
|
||||||
|
lex('<!DOCTYPE html []>').should == [
|
||||||
|
[:T_DOCTYPE_START, nil, 1],
|
||||||
|
[:T_DOCTYPE_NAME, 'html', 1],
|
||||||
|
[:T_DOCTYPE_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
example 'lex an inline doctype containing a newline' do
|
||||||
|
lex("<!DOCTYPE html [foo\n]>").should == [
|
||||||
|
[:T_DOCTYPE_START, nil, 1],
|
||||||
|
[:T_DOCTYPE_NAME, 'html', 1],
|
||||||
|
[:T_DOCTYPE_INLINE, "foo\n", 1],
|
||||||
|
[:T_DOCTYPE_END, nil, 2]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
example 'lex an inline doctype containing a trailing newline using an IO' do
|
||||||
|
input = StringIO.new("<!DOCTYPE html [foo\n]>")
|
||||||
|
|
||||||
|
lex(input).should == [
|
||||||
|
[:T_DOCTYPE_START, nil, 1],
|
||||||
|
[:T_DOCTYPE_NAME, 'html', 1],
|
||||||
|
[:T_DOCTYPE_INLINE, "foo\n", 1],
|
||||||
|
[:T_DOCTYPE_END, nil, 2]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
example 'lex an inline doctype containing a leading newline using an IO' do
|
||||||
|
input = StringIO.new("<!DOCTYPE html [\nfoo]>")
|
||||||
|
|
||||||
|
lex(input).should == [
|
||||||
|
[:T_DOCTYPE_START, nil, 1],
|
||||||
|
[:T_DOCTYPE_NAME, 'html', 1],
|
||||||
|
[:T_DOCTYPE_INLINE, "\n", 1],
|
||||||
|
[:T_DOCTYPE_INLINE, "foo", 2],
|
||||||
|
[:T_DOCTYPE_END, nil, 2]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
# Technically not valid, put in place to make sure that the Ragel rules are
|
# Technically not valid, put in place to make sure that the Ragel rules are
|
||||||
# not too greedy.
|
# not too greedy.
|
||||||
example 'lex an inline doftype followed by a system ID' do
|
example 'lex an inline doftype followed by a system ID' do
|
||||||
|
|
|
@ -94,4 +94,14 @@ describe Oga::XML::Parser do
|
||||||
@document.doctype.inline_rules.should == '<!ELEMENT foo>'
|
@document.doctype.inline_rules.should == '<!ELEMENT foo>'
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
context 'doctypes with inline rules and newlines using a StringIO' do
|
||||||
|
before :all do
|
||||||
|
@document = parse(StringIO.new("<!DOCTYPE html [\nfoo]>"))
|
||||||
|
end
|
||||||
|
|
||||||
|
example 'set the inline doctype rules' do
|
||||||
|
@document.doctype.inline_rules.should == "\nfoo"
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue