diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index 28dd337..562de55 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -106,6 +106,8 @@ callback("on_text", data, encoding, mark, ts); callback_simple("on_proc_ins_end"); + mark = 0; + fnext main; }; @@ -121,15 +123,49 @@ dquote = '"'; squote = "'"; - string_dquote = (dquote ^dquote* dquote); - string_squote = (squote ^squote* squote); - - string = string_dquote | string_squote; - action emit_string { - callback("on_string", data, encoding, ts + 1, te - 1); + callback("on_string_body", data, encoding, ts, te); + + if ( lines > 0 ) + { + advance_line(lines); + + lines = 0; + } } + action start_string_squote { + callback_simple("on_string_squote"); + + fcall string_squote; + } + + action start_string_dquote { + callback_simple("on_string_dquote"); + + fcall string_dquote; + } + + string_squote := |* + ^squote* $count_newlines => emit_string; + + squote => { + callback_simple("on_string_squote"); + + fret; + }; + *|; + + string_dquote := |* + ^dquote* $count_newlines => emit_string; + + dquote => { + callback_simple("on_string_dquote"); + + fret; + }; + *|; + # DOCTYPES # # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax @@ -161,7 +197,8 @@ }; # Lex the public/system IDs as regular strings. - string => emit_string; + squote => start_string_squote; + dquote => start_string_dquote; # Whitespace inside doctypes is ignored since there's no point in # including it. @@ -201,7 +238,8 @@ callback("on_attribute", data, encoding, ts, te); }; - string => emit_string; + squote => start_string_squote; + dquote => start_string_dquote; any; *|; @@ -259,7 +297,8 @@ }; # Attribute values. - string => emit_string; + squote => start_string_squote; + dquote => start_string_dquote; # We're done with the open tag of the element. 
'>' => { diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb index b28a5ae..fa1757c 100644 --- a/lib/oga/xml/lexer.rb +++ b/lib/oga/xml/lexer.rb @@ -182,12 +182,26 @@ module Oga end ## - # Called when processing single/double quoted strings. + # Called when processing a single quote. + # + def on_string_squote + add_token(:T_STRING_SQUOTE) + end + + ## + # Called when processing a double quote. + # + def on_string_dquote + add_token(:T_STRING_DQUOTE) + end + + ## + # Called when processing the body of a string. # # @param [String] value The data between the quotes. # - def on_string(value) - add_token(:T_STRING, Entities.decode(value)) + def on_string_body(value) + add_token(:T_STRING_BODY, Entities.decode(value)) end ## diff --git a/lib/oga/xml/parser.y b/lib/oga/xml/parser.y index d946988..38f31b3 100644 --- a/lib/oga/xml/parser.y +++ b/lib/oga/xml/parser.y @@ -18,7 +18,7 @@ # class Oga::XML::Parser -token T_STRING T_TEXT +token T_TEXT T_STRING_SQUOTE T_STRING_DQUOTE T_STRING_BODY token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME token T_DOCTYPE_INLINE token T_CDATA T_COMMENT @@ -69,13 +69,13 @@ rule } # - | T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_STRING T_DOCTYPE_END + | T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE string T_DOCTYPE_END { on_doctype(:name => val[1], :type => val[2], :public_id => val[3]) } # - | T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_STRING T_STRING T_DOCTYPE_END + | T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE string string T_DOCTYPE_END { on_doctype( :name => val[1], @@ -161,7 +161,7 @@ rule : attribute_name { val[0] } # foo="bar" - | attribute_name T_STRING + | attribute_name string { val[0].value = val[1] val[0] @@ -190,6 +190,28 @@ rule text : T_TEXT { on_text(val[0]) } ; + + string + : string_dquote + | string_squote + ; + + # Double quoted strings + string_dquote + : T_STRING_DQUOTE T_STRING_DQUOTE { '' } + | T_STRING_DQUOTE string_body T_STRING_DQUOTE { val[1] } + ; + + # Single quoted strings + 
string_squote + : T_STRING_SQUOTE T_STRING_SQUOTE { '' } + | T_STRING_SQUOTE string_body T_STRING_SQUOTE { val[1] } + ; + + string_body + : T_STRING_BODY { val[0] } + | string_body T_STRING_BODY { val[0] + val[1] } + ; end ---- inner diff --git a/spec/oga/xml/lexer/doctype_spec.rb b/spec/oga/xml/lexer/doctype_spec.rb index 89a9bb0..95e52e8 100644 --- a/spec/oga/xml/lexer/doctype_spec.rb +++ b/spec/oga/xml/lexer/doctype_spec.rb @@ -15,8 +15,12 @@ describe Oga::XML::Lexer do [:T_DOCTYPE_START, nil, 1], [:T_DOCTYPE_NAME, 'HTML', 1], [:T_DOCTYPE_TYPE, 'PUBLIC', 1], - [:T_STRING, 'foobar', 1], - [:T_STRING, 'baz', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, 'foobar', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, 'baz', 1], + [:T_STRING_DQUOTE, nil, 1], [:T_DOCTYPE_END, nil, 1] ] end @@ -26,8 +30,12 @@ describe Oga::XML::Lexer do [:T_DOCTYPE_START, nil, 1], [:T_DOCTYPE_NAME, 'HTML', 1], [:T_DOCTYPE_TYPE, 'PUBLIC', 1], - [:T_STRING, 'foobar', 1], - [:T_STRING, 'baz', 1], + [:T_STRING_SQUOTE, nil, 1], + [:T_STRING_BODY, 'foobar', 1], + [:T_STRING_SQUOTE, nil, 1], + [:T_STRING_SQUOTE, nil, 1], + [:T_STRING_BODY, 'baz', 1], + [:T_STRING_SQUOTE, nil, 1], [:T_DOCTYPE_END, nil, 1] ] end @@ -48,7 +56,9 @@ describe Oga::XML::Lexer do [:T_DOCTYPE_START, nil, 1], [:T_DOCTYPE_NAME, 'html', 1], [:T_DOCTYPE_INLINE, '', 1], - [:T_STRING, 'foo', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, 'foo', 1], + [:T_STRING_DQUOTE, nil, 1], [:T_DOCTYPE_END, nil, 1] ] end diff --git a/spec/oga/xml/lexer/elements_spec.rb b/spec/oga/xml/lexer/elements_spec.rb index 3455e2b..079bbd4 100644 --- a/spec/oga/xml/lexer/elements_spec.rb +++ b/spec/oga/xml/lexer/elements_spec.rb @@ -58,7 +58,8 @@ describe Oga::XML::Lexer do [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'p', 1], [:T_ATTR, 'foo', 1], - [:T_STRING, '', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_DQUOTE, nil, 1], [:T_ELEM_END, nil, 1] ] end @@ -68,18 +69,35 @@ describe Oga::XML::Lexer do 
[:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'p', 1], [:T_ATTR, 'class', 1], - [:T_STRING, 'foo', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, 'foo', 1], + [:T_STRING_DQUOTE, nil, 1], [:T_TEXT, 'Hello', 1], [:T_ELEM_END, nil, 1] ] end + example 'lex a paragraph element with a newline in an attribute' do + lex("
Hello
").should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'p', 1], + [:T_ATTR, 'class', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, "\nfoo", 1], + [:T_STRING_DQUOTE, nil, 2], + [:T_TEXT, 'Hello', 2], + [:T_ELEM_END, nil, 2] + ] + end + example 'lex a paragraph element with single quoted attributes' do lex("").should == [ [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'p', 1], [:T_ATTR, 'class', 1], - [:T_STRING, 'foo', 1], + [:T_STRING_SQUOTE, nil, 1], + [:T_STRING_BODY, 'foo', 1], + [:T_STRING_SQUOTE, nil, 1], [:T_ELEM_END, nil, 1] ] end @@ -90,7 +108,9 @@ describe Oga::XML::Lexer do [:T_ELEM_NAME, 'p', 1], [:T_ATTR_NS, 'foo', 1], [:T_ATTR, 'bar', 1], - [:T_STRING, 'baz', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, 'baz', 1], + [:T_STRING_DQUOTE, nil, 1], [:T_ELEM_END, nil, 1] ] end @@ -137,7 +157,9 @@ describe Oga::XML::Lexer do [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'br', 1], [:T_ATTR, 'class', 1], - [:T_STRING, 'foo', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, 'foo', 1], + [:T_STRING_DQUOTE, nil, 1], [:T_ELEM_END, nil, 1] ] end diff --git a/spec/oga/xml/lexer/entities_spec.rb b/spec/oga/xml/lexer/entities_spec.rb index e5ba251..3dd2021 100644 --- a/spec/oga/xml/lexer/entities_spec.rb +++ b/spec/oga/xml/lexer/entities_spec.rb @@ -21,7 +21,9 @@ describe Oga::XML::Lexer do [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'foo', 1], [:T_ATTR, 'class', 1], - [:T_STRING, '&', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, '&', 1], + [:T_STRING_DQUOTE, nil, 1], [:T_ELEM_END, nil, 1] ] end @@ -31,7 +33,9 @@ describe Oga::XML::Lexer do [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'foo', 1], [:T_ATTR, 'class', 1], - [:T_STRING, '<', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, '<', 1], + [:T_STRING_DQUOTE, nil, 1], [:T_ELEM_END, nil, 1] ] end @@ -41,7 +45,9 @@ describe Oga::XML::Lexer do [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'foo', 1], [:T_ATTR, 'class', 1], - [:T_STRING, '>', 1], + [:T_STRING_DQUOTE, nil, 1], + [:T_STRING_BODY, '>', 1], + 
[:T_STRING_DQUOTE, nil, 1], [:T_ELEM_END, nil, 1] ] end diff --git a/spec/oga/xml/lexer/io_spec.rb b/spec/oga/xml/lexer/io_spec.rb index 66c8f63..2fe3a52 100644 --- a/spec/oga/xml/lexer/io_spec.rb +++ b/spec/oga/xml/lexer/io_spec.rb @@ -2,16 +2,16 @@ require 'spec_helper' describe Oga::XML::Lexer do context 'IO as input' do - before do - @io = StringIO.new("\nHello
") - end - example 'lex a paragraph element with attributes' do - lex(@io).should == [ + io = StringIO.new("\nHello
") + + lex(io).should == [ [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'p', 1], [:T_ATTR, 'class', 1], - [:T_STRING, 'foo', 1], + [:T_STRING_SQUOTE, nil, 1], + [:T_STRING_BODY, 'foo', 1], + [:T_STRING_SQUOTE, nil, 1], [:T_TEXT, "\n", 1], [:T_TEXT, 'Hello', 2], [:T_ELEM_END, nil, 2] @@ -19,10 +19,43 @@ describe Oga::XML::Lexer do end example 'rewind input when resetting the lexer' do - lexer = described_class.new(@io) + io = StringIO.new("\nHello
") + lexer = described_class.new(io) lexer.lex.empty?.should == false lexer.lex.empty?.should == false end + + example 'lex an attribute value starting with a newline' do + io = StringIO.new("