Support for inline doctype rules plus newlines.

This adds support for lexing/parsing XML documents that use an IO as input _and_
contain doctype rules with newlines in them.

This fixes #63.
This commit is contained in:
Yorick Peterse 2014-11-18 20:02:55 +01:00
parent f88df486ba
commit cbb2815146
4 changed files with 74 additions and 6 deletions

View File

@ -183,6 +183,22 @@
fnext doctype; fnext doctype;
} }
# Machine for processing inline rules of a doctype.
#
# This scanner is entered from the `doctype` machine when a '[' is seen and
# hands control back to `doctype` once the closing ']' is reached.
doctype_inline := |*
# A run of characters other than ']'. The `count_newlines` action is embedded
# on every transition of the run; it is defined elsewhere and presumably
# increments `lines` for each newline (TODO confirm). The matched text
# (ts..te) is reported through the "on_doctype_inline" callback, after which
# any counted newlines are applied with advance_line() and the counter is
# reset so they are not counted twice.
^']'* $count_newlines => {
callback("on_doctype_inline", data, encoding, ts, te);
if ( lines > 0 )
{
advance_line(lines);
lines = 0;
}
};
# The closing bracket ends the inline rules: no callback is invoked for it,
# the scanner simply continues in the `doctype` machine.
']' => { fnext doctype; };
*|;
# Machine for processing doctypes. Doctype values such as the public # Machine for processing doctypes. Doctype values such as the public
# and system IDs are treated as T_STRING tokens. # and system IDs are treated as T_STRING tokens.
doctype := |* doctype := |*
@ -190,11 +206,8 @@
callback("on_doctype_type", data, encoding, ts, te); callback("on_doctype_type", data, encoding, ts, te);
}; };
# Consumes everything between the [ and ]. Due to the use of :> the ] # Starts a set of inline doctype rules.
# is not consumed by any+. '[' => { fnext doctype_inline; };
'[' any+ :> ']' => {
callback("on_doctype_inline", data, encoding, ts + 1, te - 1);
};
# Lex the public/system IDs as regular strings. # Lex the public/system IDs as regular strings.
squote => start_string_squote; squote => start_string_squote;

View File

@ -86,12 +86,17 @@ rule
} }
# <!DOCTYPE html [ ... ]> # <!DOCTYPE html [ ... ]>
| T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_INLINE T_DOCTYPE_END | T_DOCTYPE_START T_DOCTYPE_NAME doctype_inline T_DOCTYPE_END
{ {
on_doctype(:name => val[1], :inline_rules => val[2]) on_doctype(:name => val[1], :inline_rules => val[2])
} }
; ;
# One or more consecutive T_DOCTYPE_INLINE tokens folded into a single value.
# The first alternative yields the token's value as-is; the second is
# left-recursive and appends each following token's value to the accumulated
# one using `+` (assumes the token values are Strings -- TODO confirm).
doctype_inline
: T_DOCTYPE_INLINE { val[0] }
| doctype_inline T_DOCTYPE_INLINE { val[0] + val[1] }
;
# CDATA tags # CDATA tags
cdata cdata

View File

@ -49,6 +49,46 @@ describe Oga::XML::Lexer do
] ]
end end
# Nothing between the brackets: the expected token stream contains no
# T_DOCTYPE_INLINE token at all.
example 'lex an empty inline doctype' do
  expected = [
    [:T_DOCTYPE_START, nil, 1],
    [:T_DOCTYPE_NAME, 'html', 1],
    [:T_DOCTYPE_END, nil, 1]
  ]

  lex('<!DOCTYPE html []>').should == expected
end
# String input: the newline stays part of the inline token's value and the
# closing token is reported on line 2.
example 'lex an inline doctype containing a newline' do
  expected = [
    [:T_DOCTYPE_START, nil, 1],
    [:T_DOCTYPE_NAME, 'html', 1],
    [:T_DOCTYPE_INLINE, "foo\n", 1],
    [:T_DOCTYPE_END, nil, 2]
  ]

  lex("<!DOCTYPE html [foo\n]>").should == expected
end
# Same document as the String-based example, but fed to the lexer through a
# StringIO; the expected tokens are identical.
example 'lex an inline doctype containing a trailing newline using an IO' do
  expected = [
    [:T_DOCTYPE_START, nil, 1],
    [:T_DOCTYPE_NAME, 'html', 1],
    [:T_DOCTYPE_INLINE, "foo\n", 1],
    [:T_DOCTYPE_END, nil, 2]
  ]

  lex(StringIO.new("<!DOCTYPE html [foo\n]>")).should == expected
end
# With the newline directly after the opening bracket, the inline rules are
# expected as two separate T_DOCTYPE_INLINE tokens: the newline on line 1 and
# the remaining text on line 2.
example 'lex an inline doctype containing a leading newline using an IO' do
  expected = [
    [:T_DOCTYPE_START, nil, 1],
    [:T_DOCTYPE_NAME, 'html', 1],
    [:T_DOCTYPE_INLINE, "\n", 1],
    [:T_DOCTYPE_INLINE, "foo", 2],
    [:T_DOCTYPE_END, nil, 2]
  ]

  lex(StringIO.new("<!DOCTYPE html [\nfoo]>")).should == expected
end
# Technically not valid, put in place to make sure that the Ragel rules are # Technically not valid, put in place to make sure that the Ragel rules are
# not too greedy. # not too greedy.
example 'lex an inline doftype followed by a system ID' do example 'lex an inline doftype followed by a system ID' do

View File

@ -94,4 +94,14 @@ describe Oga::XML::Parser do
@document.doctype.inline_rules.should == '<!ELEMENT foo>' @document.doctype.inline_rules.should == '<!ELEMENT foo>'
end end
end end
# Parses a doctype whose inline rules start with a newline, reading from a
# StringIO, and checks that the document's doctype exposes the rules as one
# string.
context 'doctypes with inline rules and newlines using a StringIO' do
  before(:all) do
    io = StringIO.new("<!DOCTYPE html [\nfoo]>")
    @document = parse(io)
  end

  example 'set the inline doctype rules' do
    rules = @document.doctype.inline_rules

    rules.should == "\nfoo"
  end
end
end end