From cbb2815146a79805b8da483d2ef48d17e2959e72 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Tue, 18 Nov 2014 20:02:55 +0100 Subject: [PATCH] Support for inline doctype rules plus newlines. This adds support for lexing/parsing XML documents that use an IO as input _and_ contain doctype rules with newlines in them. This fixes #63. --- ext/ragel/base_lexer.rl | 23 +++++++++++++---- lib/oga/xml/parser.y | 7 ++++- spec/oga/xml/lexer/doctype_spec.rb | 40 +++++++++++++++++++++++++++++ spec/oga/xml/parser/doctype_spec.rb | 10 ++++++++ 4 files changed, 74 insertions(+), 6 deletions(-) diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index 562de55..5a9c5c7 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -183,6 +183,22 @@ fnext doctype; } + # Machine for processing inline rules of a doctype. + doctype_inline := |* + ^']'* $count_newlines => { + callback("on_doctype_inline", data, encoding, ts, te); + + if ( lines > 0 ) + { + advance_line(lines); + + lines = 0; + } + }; + + ']' => { fnext doctype; }; + *|; + # Machine for processing doctypes. Doctype values such as the public # and system IDs are treated as T_STRING tokens. doctype := |* @@ -190,11 +206,8 @@ callback("on_doctype_type", data, encoding, ts, te); }; - # Consumes everything between the [ and ]. Due to the use of :> the ] - # is not consumed by any+. - '[' any+ :> ']' => { - callback("on_doctype_inline", data, encoding, ts + 1, te - 1); - }; + # Starts a set of inline doctype rules. + '[' => { fnext doctype_inline; }; # Lex the public/system IDs as regular strings. 
squote => start_string_squote; diff --git a/lib/oga/xml/parser.y b/lib/oga/xml/parser.y index 38f31b3..92f19fc 100644 --- a/lib/oga/xml/parser.y +++ b/lib/oga/xml/parser.y @@ -86,12 +86,17 @@ rule } # - | T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_INLINE T_DOCTYPE_END + | T_DOCTYPE_START T_DOCTYPE_NAME doctype_inline T_DOCTYPE_END { on_doctype(:name => val[1], :inline_rules => val[2]) } ; + doctype_inline + : T_DOCTYPE_INLINE { val[0] } + | doctype_inline T_DOCTYPE_INLINE { val[0] + val[1] } + ; + # CDATA tags cdata diff --git a/spec/oga/xml/lexer/doctype_spec.rb b/spec/oga/xml/lexer/doctype_spec.rb index 95e52e8..f38470d 100644 --- a/spec/oga/xml/lexer/doctype_spec.rb +++ b/spec/oga/xml/lexer/doctype_spec.rb @@ -49,6 +49,46 @@ describe Oga::XML::Lexer do ] end + example 'lex an empty inline doctype' do + lex('<!DOCTYPE html []>').should == [ + [:T_DOCTYPE_START, nil, 1], + [:T_DOCTYPE_NAME, 'html', 1], + [:T_DOCTYPE_END, nil, 1] + ] + end + + example 'lex an inline doctype containing a newline' do + lex("<!DOCTYPE html [foo\n]>").should == [ + [:T_DOCTYPE_START, nil, 1], + [:T_DOCTYPE_NAME, 'html', 1], + [:T_DOCTYPE_INLINE, "foo\n", 1], + [:T_DOCTYPE_END, nil, 2] + ] + end + + example 'lex an inline doctype containing a trailing newline using an IO' do + input = StringIO.new("<!DOCTYPE html [foo\n]>") + + lex(input).should == [ + [:T_DOCTYPE_START, nil, 1], + [:T_DOCTYPE_NAME, 'html', 1], + [:T_DOCTYPE_INLINE, "foo\n", 1], + [:T_DOCTYPE_END, nil, 2] + ] + end + + example 'lex an inline doctype containing a leading newline using an IO' do + input = StringIO.new("<!DOCTYPE html [\nfoo]>") + + lex(input).should == [ + [:T_DOCTYPE_START, nil, 1], + [:T_DOCTYPE_NAME, 'html', 1], + [:T_DOCTYPE_INLINE, "\n", 1], + [:T_DOCTYPE_INLINE, "foo", 2], + [:T_DOCTYPE_END, nil, 2] + ] + end + # Technically not valid, put in place to make sure that the Ragel rules are # not too greedy. 
example 'lex an inline doftype followed by a system ID' do diff --git a/spec/oga/xml/parser/doctype_spec.rb b/spec/oga/xml/parser/doctype_spec.rb index bf0ef03..9bf4fa7 100644 --- a/spec/oga/xml/parser/doctype_spec.rb +++ b/spec/oga/xml/parser/doctype_spec.rb @@ -94,4 +94,14 @@ describe Oga::XML::Parser do @document.doctype.inline_rules.should == '' end end + + context 'doctypes with inline rules and newlines using a StringIO' do + before :all do + @document = parse(StringIO.new("<!DOCTYPE html [\nfoo]>")) + end + + example 'set the inline doctype rules' do + @document.doctype.inline_rules.should == "\nfoo" + end + end end