Support for inline doctype rules plus newlines.

This adds support for lexing/parsing XML documents that use an IO as input _and_
contain doctype rules with newlines in them.

This fixes #63.
This commit is contained in:
Yorick Peterse 2014-11-18 20:02:55 +01:00
parent f88df486ba
commit cbb2815146
4 changed files with 74 additions and 6 deletions

View File

@ -183,6 +183,22 @@
fnext doctype;
}
# Machine for processing the inline rules of a doctype, i.e. the text between
# the opening [ and the closing ] of a doctype declaration. The doctype machine
# switches to this machine when it encounters the opening bracket.
doctype_inline := |*
# Matches a run of characters that are not the closing bracket. The
# count_newlines action (defined outside this block) is embedded on every
# transition of the match; presumably it increments the lines counter that
# is checked below - confirm against its definition.
#
# The matched text (ts up to te) is reported through on_doctype_inline
# first. Only afterwards is the line number advanced by the number of
# newlines seen, and the counter reset, so the token is emitted before the
# line number changes.
^']'* $count_newlines => {
callback("on_doctype_inline", data, encoding, ts, te);
if ( lines > 0 )
{
advance_line(lines);
lines = 0;
}
};
# The closing bracket ends the inline rules. It produces no token; control
# is handed back to the doctype machine.
']' => { fnext doctype; };
*|;
# Machine for processing doctypes. Doctype values such as the public
# and system IDs are treated as T_STRING tokens.
doctype := |*
@ -190,11 +206,8 @@
callback("on_doctype_type", data, encoding, ts, te);
};
# Consumes everything between the [ and ]. Due to the use of :> the ]
# is not consumed by any+.
'[' any+ :> ']' => {
callback("on_doctype_inline", data, encoding, ts + 1, te - 1);
};
# Starts a set of inline doctype rules.
'[' => { fnext doctype_inline; };
# Lex the public/system IDs as regular strings.
squote => start_string_squote;

View File

@ -86,12 +86,17 @@ rule
}
# <!DOCTYPE html [ ... ]>
| T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_INLINE T_DOCTYPE_END
| T_DOCTYPE_START T_DOCTYPE_NAME doctype_inline T_DOCTYPE_END
{
on_doctype(:name => val[1], :inline_rules => val[2])
}
;
# One or more consecutive T_DOCTYPE_INLINE tokens. The lexer may emit the
# inline rules of a single doctype as several tokens, so the values are
# concatenated with + from left to right into a single value.
#
# NOTE(review): both actions rely on the value of their last expression being
# used as the value of the rule; presumably the grammar enables the
# no_result_var option - confirm in the grammar header.
doctype_inline
: T_DOCTYPE_INLINE { val[0] }
| doctype_inline T_DOCTYPE_INLINE { val[0] + val[1] }
;
# CDATA tags
cdata

View File

@ -49,6 +49,46 @@ describe Oga::XML::Lexer do
]
end
# An empty [] section must not produce any T_DOCTYPE_INLINE token.
example 'lex an empty inline doctype' do
  expected = [
    [:T_DOCTYPE_START, nil, 1],
    [:T_DOCTYPE_NAME, 'html', 1],
    [:T_DOCTYPE_END, nil, 1]
  ]

  lex('<!DOCTYPE html []>').should == expected
end
# The newline belongs to the inline token (reported on line 1); the closing
# token that follows it is reported on line 2.
example 'lex an inline doctype containing a newline' do
  expected = [
    [:T_DOCTYPE_START, nil, 1],
    [:T_DOCTYPE_NAME, 'html', 1],
    [:T_DOCTYPE_INLINE, "foo\n", 1],
    [:T_DOCTYPE_END, nil, 2]
  ]

  lex("<!DOCTYPE html [foo\n]>").should == expected
end
# Same input as the String based example, but fed to the lexer through an IO
# object; the resulting tokens are expected to be identical.
example 'lex an inline doctype containing a trailing newline using an IO' do
  tokens = lex(StringIO.new("<!DOCTYPE html [foo\n]>"))

  tokens.should == [
    [:T_DOCTYPE_START, nil, 1],
    [:T_DOCTYPE_NAME, 'html', 1],
    [:T_DOCTYPE_INLINE, "foo\n", 1],
    [:T_DOCTYPE_END, nil, 2]
  ]
end
# With a newline directly after the opening bracket the inline rules arrive as
# two tokens: the newline on line 1, then the remaining text on line 2.
example 'lex an inline doctype containing a leading newline using an IO' do
  tokens = lex(StringIO.new("<!DOCTYPE html [\nfoo]>"))

  tokens.should == [
    [:T_DOCTYPE_START, nil, 1],
    [:T_DOCTYPE_NAME, 'html', 1],
    [:T_DOCTYPE_INLINE, "\n", 1],
    [:T_DOCTYPE_INLINE, "foo", 2],
    [:T_DOCTYPE_END, nil, 2]
  ]
end
# Technically not valid, put in place to make sure that the Ragel rules are
# not too greedy.
example 'lex an inline doftype followed by a system ID' do

View File

@ -94,4 +94,14 @@ describe Oga::XML::Parser do
@document.doctype.inline_rules.should == '<!ELEMENT foo>'
end
end
# Checks that inline doctype rules containing a newline come out of the parser
# intact when the document is read from a StringIO.
context 'doctypes with inline rules and newlines using a StringIO' do
  before(:all) do
    source    = StringIO.new("<!DOCTYPE html [\nfoo]>")
    @document = parse(source)
  end

  example 'set the inline doctype rules' do
    rules = @document.doctype.inline_rules

    rules.should == "\nfoo"
  end
end
end