From 24ae791f00380763134654a88c9d2b7a09168a95 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Sun, 26 Oct 2014 11:39:56 +0100 Subject: [PATCH] Better support for lexing multi-line strings. When lexing multi-line strings everything used to work fine as long as the input was read as a whole. However, when using an IO instance all hell would break loose. Due to the lexer reading IO instances on a per-line basis, sometimes Ragel would end up setting "ts" to NULL. For example, the following input would break the lexer: Due to the input being read per line, the following data would be sent to the lexer: This would result in different (or NULL) pointers being used for building a string, in turn resulting in memory allocation errors. To work around this, the string lexing setup has been broken into separate machines for single and double quoted strings. The tokens used have also been changed so that instead of just "T_STRING" there are now the following tokens: * T_STRING_SQUOTE * T_STRING_DQUOTE * T_STRING_BODY A string can have multiple T_STRING_BODY tokens (= multi-line strings, only the case for IO inputs). These body tokens are stitched back together by the parser. This fixes #58. 
--- ext/ragel/base_lexer.rl | 57 ++++++++++++++++++---- lib/oga/xml/lexer.rb | 20 ++++++-- lib/oga/xml/parser.y | 30 ++++++++++-- spec/oga/xml/lexer/doctype_spec.rb | 20 ++++++-- spec/oga/xml/lexer/elements_spec.rb | 32 ++++++++++-- spec/oga/xml/lexer/entities_spec.rb | 12 +++-- spec/oga/xml/lexer/io_spec.rb | 47 +++++++++++++++--- spec/oga/xml/lexer/xml_declaration_spec.rb | 4 +- spec/oga/xml/parser/io_spec.rb | 19 ++++++++ 9 files changed, 204 insertions(+), 37 deletions(-) create mode 100644 spec/oga/xml/parser/io_spec.rb diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index 28dd337..562de55 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -106,6 +106,8 @@ callback("on_text", data, encoding, mark, ts); callback_simple("on_proc_ins_end"); + mark = 0; + fnext main; }; @@ -121,15 +123,49 @@ dquote = '"'; squote = "'"; - string_dquote = (dquote ^dquote* dquote); - string_squote = (squote ^squote* squote); - - string = string_dquote | string_squote; - action emit_string { - callback("on_string", data, encoding, ts + 1, te - 1); + callback("on_string_body", data, encoding, ts, te); + + if ( lines > 0 ) + { + advance_line(lines); + + lines = 0; + } } + action start_string_squote { + callback_simple("on_string_squote"); + + fcall string_squote; + } + + action start_string_dquote { + callback_simple("on_string_dquote"); + + fcall string_dquote; + } + + string_squote := |* + ^squote* $count_newlines => emit_string; + + squote => { + callback_simple("on_string_squote"); + + fret; + }; + *|; + + string_dquote := |* + ^dquote* $count_newlines => emit_string; + + dquote => { + callback_simple("on_string_dquote"); + + fret; + }; + *|; + # DOCTYPES # # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax @@ -161,7 +197,8 @@ }; # Lex the public/system IDs as regular strings. 
- string => emit_string; + squote => start_string_squote; + dquote => start_string_dquote; # Whitespace inside doctypes is ignored since there's no point in # including it. @@ -201,7 +238,8 @@ callback("on_attribute", data, encoding, ts, te); }; - string => emit_string; + squote => start_string_squote; + dquote => start_string_dquote; any; *|; @@ -259,7 +297,8 @@ }; # Attribute values. - string => emit_string; + squote => start_string_squote; + dquote => start_string_dquote; # We're done with the open tag of the element. '>' => { diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb index b28a5ae..fa1757c 100644 --- a/lib/oga/xml/lexer.rb +++ b/lib/oga/xml/lexer.rb @@ -182,12 +182,26 @@ module Oga end ## - # Called when processing single/double quoted strings. + # Called when processing a single quote. + # + def on_string_squote + add_token(:T_STRING_SQUOTE) + end + + ## + # Called when processing a double quote. + # + def on_string_dquote + add_token(:T_STRING_DQUOTE) + end + + ## + # Called when processing the body of a string. # # @param [String] value The data between the quotes. 
# - def on_string(value) - add_token(:T_STRING, Entities.decode(value)) + def on_string_body(value) + add_token(:T_STRING_BODY, Entities.decode(value)) end ## diff --git a/lib/oga/xml/parser.y b/lib/oga/xml/parser.y index d946988..38f31b3 100644 --- a/lib/oga/xml/parser.y +++ b/lib/oga/xml/parser.y @@ -18,7 +18,7 @@ # class Oga::XML::Parser -token T_STRING T_TEXT +token T_TEXT T_STRING_SQUOTE T_STRING_DQUOTE T_STRING_BODY token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME token T_DOCTYPE_INLINE token T_CDATA T_COMMENT @@ -69,13 +69,13 @@ rule } # - | T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_STRING T_DOCTYPE_END + | T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE string T_DOCTYPE_END { on_doctype(:name => val[1], :type => val[2], :public_id => val[3]) } # - | T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_STRING T_STRING T_DOCTYPE_END + | T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE string string T_DOCTYPE_END { on_doctype( :name => val[1], @@ -161,7 +161,7 @@ rule : attribute_name { val[0] } # foo="bar" - | attribute_name T_STRING + | attribute_name string { val[0].value = val[1] val[0] @@ -190,6 +190,28 @@ rule text : T_TEXT { on_text(val[0]) } ; + + string + : string_dquote + | string_squote + ; + + # Double quoted strings + string_dquote + : T_STRING_DQUOTE T_STRING_DQUOTE { '' } + | T_STRING_DQUOTE string_body T_STRING_DQUOTE { val[1] } + ; + + # Single quoted strings + string_squote + : T_STRING_SQUOTE T_STRING_SQUOTE { '' } + | T_STRING_SQUOTE string_body T_STRING_SQUOTE { val[1] } + ; + + string_body + : T_STRING_BODY { val[0] } + | string_body T_STRING_BODY { val[0] + val[1] } + ; end ---- inner diff --git a/spec/oga/xml/lexer/doctype_spec.rb b/spec/oga/xml/lexer/doctype_spec.rb index 89a9bb0..95e52e8 100644 --- a/spec/oga/xml/lexer/doctype_spec.rb +++ b/spec/oga/xml/lexer/doctype_spec.rb @@ -15,8 +15,12 @@ describe Oga::XML::Lexer do [:T_DOCTYPE_START, nil, 1], [:T_DOCTYPE_NAME, 'HTML', 1], [:T_DOCTYPE_TYPE, 'PUBLIC', 1], - 
[:T_STRING, 'foobar', 1], - [:T_STRING, 'baz', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, 'foobar', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, 'baz', 1], + [:T_STRING_DQUOTE, nil, 1], [:T_DOCTYPE_END, nil, 1] ] end @@ -26,8 +30,12 @@ describe Oga::XML::Lexer do [:T_DOCTYPE_START, nil, 1], [:T_DOCTYPE_NAME, 'HTML', 1], [:T_DOCTYPE_TYPE, 'PUBLIC', 1], - [:T_STRING, 'foobar', 1], - [:T_STRING, 'baz', 1], + [:T_STRING_SQUOTE, nil, 1], + [:T_STRING_BODY, 'foobar', 1], + [:T_STRING_SQUOTE, nil, 1], + [:T_STRING_SQUOTE, nil, 1], + [:T_STRING_BODY, 'baz', 1], + [:T_STRING_SQUOTE, nil, 1], [:T_DOCTYPE_END, nil, 1] ] end @@ -48,7 +56,9 @@ describe Oga::XML::Lexer do [:T_DOCTYPE_START, nil, 1], [:T_DOCTYPE_NAME, 'html', 1], [:T_DOCTYPE_INLINE, '', 1], - [:T_STRING, 'foo', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, 'foo', 1], + [:T_STRING_DQUOTE, nil, 1], [:T_DOCTYPE_END, nil, 1] ] end diff --git a/spec/oga/xml/lexer/elements_spec.rb b/spec/oga/xml/lexer/elements_spec.rb index 3455e2b..079bbd4 100644 --- a/spec/oga/xml/lexer/elements_spec.rb +++ b/spec/oga/xml/lexer/elements_spec.rb @@ -58,7 +58,8 @@ describe Oga::XML::Lexer do [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'p', 1], [:T_ATTR, 'foo', 1], - [:T_STRING, '', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_DQUOTE, nil, 1], [:T_ELEM_END, nil, 1] ] end @@ -68,18 +69,35 @@ describe Oga::XML::Lexer do [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'p', 1], [:T_ATTR, 'class', 1], - [:T_STRING, 'foo', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, 'foo', 1], + [:T_STRING_DQUOTE, nil, 1], [:T_TEXT, 'Hello', 1], [:T_ELEM_END, nil, 1] ] end + example 'lex a paragraph element with a newline in an attribute' do + lex("

Hello

").should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'p', 1], + [:T_ATTR, 'class', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, "\nfoo", 1], + [:T_STRING_DQUOTE, nil, 2], + [:T_TEXT, 'Hello', 2], + [:T_ELEM_END, nil, 2] + ] + end + example 'lex a paragraph element with single quoted attributes' do lex("

").should == [ [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'p', 1], [:T_ATTR, 'class', 1], - [:T_STRING, 'foo', 1], + [:T_STRING_SQUOTE, nil, 1], + [:T_STRING_BODY, 'foo', 1], + [:T_STRING_SQUOTE, nil, 1], [:T_ELEM_END, nil, 1] ] end @@ -90,7 +108,9 @@ describe Oga::XML::Lexer do [:T_ELEM_NAME, 'p', 1], [:T_ATTR_NS, 'foo', 1], [:T_ATTR, 'bar', 1], - [:T_STRING, 'baz', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, 'baz', 1], + [:T_STRING_DQUOTE, nil, 1], [:T_ELEM_END, nil, 1] ] end @@ -137,7 +157,9 @@ describe Oga::XML::Lexer do [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'br', 1], [:T_ATTR, 'class', 1], - [:T_STRING, 'foo', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, 'foo', 1], + [:T_STRING_DQUOTE, nil, 1], [:T_ELEM_END, nil, 1] ] end diff --git a/spec/oga/xml/lexer/entities_spec.rb b/spec/oga/xml/lexer/entities_spec.rb index e5ba251..3dd2021 100644 --- a/spec/oga/xml/lexer/entities_spec.rb +++ b/spec/oga/xml/lexer/entities_spec.rb @@ -21,7 +21,9 @@ describe Oga::XML::Lexer do [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'foo', 1], [:T_ATTR, 'class', 1], - [:T_STRING, '&', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, '&', 1], + [:T_STRING_DQUOTE, nil, 1], [:T_ELEM_END, nil, 1] ] end @@ -31,7 +33,9 @@ describe Oga::XML::Lexer do [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'foo', 1], [:T_ATTR, 'class', 1], - [:T_STRING, '<', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, '<', 1], + [:T_STRING_DQUOTE, nil, 1], [:T_ELEM_END, nil, 1] ] end @@ -41,7 +45,9 @@ describe Oga::XML::Lexer do [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'foo', 1], [:T_ATTR, 'class', 1], - [:T_STRING, '>', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, '>', 1], + [:T_STRING_DQUOTE, nil, 1], [:T_ELEM_END, nil, 1] ] end diff --git a/spec/oga/xml/lexer/io_spec.rb b/spec/oga/xml/lexer/io_spec.rb index 66c8f63..2fe3a52 100644 --- a/spec/oga/xml/lexer/io_spec.rb +++ b/spec/oga/xml/lexer/io_spec.rb @@ -2,16 +2,16 @@ require 'spec_helper' describe Oga::XML::Lexer do context 'IO as input' 
do - before do - @io = StringIO.new("

\nHello

") - end - example 'lex a paragraph element with attributes' do - lex(@io).should == [ + io = StringIO.new("

\nHello

") + + lex(io).should == [ [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'p', 1], [:T_ATTR, 'class', 1], - [:T_STRING, 'foo', 1], + [:T_STRING_SQUOTE, nil, 1], + [:T_STRING_BODY, 'foo', 1], + [:T_STRING_SQUOTE, nil, 1], [:T_TEXT, "\n", 1], [:T_TEXT, 'Hello', 2], [:T_ELEM_END, nil, 2] @@ -19,10 +19,43 @@ describe Oga::XML::Lexer do end example 'rewind input when resetting the lexer' do - lexer = described_class.new(@io) + io = StringIO.new("

\nHello

") + lexer = described_class.new(io) lexer.lex.empty?.should == false lexer.lex.empty?.should == false end + + example 'lex an attribute value starting with a newline' do + io = StringIO.new("") + lexer = described_class.new(io) + + lexer.lex.should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'foo', 1], + [:T_ATTR, 'bar', 1], + [:T_STRING_SQUOTE, nil, 1], + [:T_STRING_BODY, "\n", 1], + [:T_STRING_BODY, "10", 2], + [:T_STRING_SQUOTE, nil, 2], + [:T_ELEM_END, nil, 2] + ] + end + + example 'lex an attribute value split in two by a newline' do + io = StringIO.new("") + lexer = described_class.new(io) + + lexer.lex.should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'foo', 1], + [:T_ATTR, 'bar', 1], + [:T_STRING_SQUOTE, nil, 1], + [:T_STRING_BODY, "foo\n", 1], + [:T_STRING_BODY, 'bar', 2], + [:T_STRING_SQUOTE, nil, 2], + [:T_ELEM_END, nil, 2] + ] + end end end diff --git a/spec/oga/xml/lexer/xml_declaration_spec.rb b/spec/oga/xml/lexer/xml_declaration_spec.rb index 35a6a86..8459592 100644 --- a/spec/oga/xml/lexer/xml_declaration_spec.rb +++ b/spec/oga/xml/lexer/xml_declaration_spec.rb @@ -17,7 +17,9 @@ describe Oga::XML::Lexer do lex('').should == [ [:T_XML_DECL_START, nil, 1], [:T_ATTR, 'version', 1], - [:T_STRING, '1.0', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, '1.0', 1], + [:T_STRING_DQUOTE, nil, 1], [:T_XML_DECL_END, nil, 1] ] end diff --git a/spec/oga/xml/parser/io_spec.rb b/spec/oga/xml/parser/io_spec.rb new file mode 100644 index 0000000..fd1fa11 --- /dev/null +++ b/spec/oga/xml/parser/io_spec.rb @@ -0,0 +1,19 @@ +require 'spec_helper' + +describe Oga::XML::Parser do + context 'IO as input' do + example 'parse an attribute starting with a newline' do + io = StringIO.new("") + doc = parse(io) + + doc.children[0].attributes[0].value.should == "\n10" + end + + example 'parse an attribute value split in two by a newline' do + io = StringIO.new("") + doc = parse(io) + + doc.children[0].attributes[0].value.should == "foo\nbar" + end + end +end