Better support for lexing multi-line strings.
When lexing multi-line strings, everything used to work fine as long as the input was read as a whole. However, when using an IO instance all hell would break loose. Because the lexer reads IO instances line by line, Ragel would sometimes end up setting "ts" to NULL. For example, the following input would break the lexer: <foo class="\nbar" /> Because the input is read per line, the data would be sent to the lexer in two separate chunks, split after the newline: <foo class="\n followed by bar" /> This would result in different (or NULL) pointers being used for building a string, in turn resulting in memory allocation errors. To work around this, the string lexing setup has been broken into separate machines for single and double quoted strings. The tokens used have also been changed so that instead of just "T_STRING" there are now the following tokens: * T_STRING_SQUOTE * T_STRING_DQUOTE * T_STRING_BODY A string can have multiple T_STRING_BODY tokens (= multi-line strings, which only occurs for IO inputs). These tokens are stitched back together into a single string by the parser. This fixes #58.
This commit is contained in:
parent
fca88a69d1
commit
24ae791f00
|
@ -106,6 +106,8 @@
|
|||
callback("on_text", data, encoding, mark, ts);
|
||||
callback_simple("on_proc_ins_end");
|
||||
|
||||
mark = 0;
|
||||
|
||||
fnext main;
|
||||
};
|
||||
|
||||
|
@ -121,15 +123,49 @@
|
|||
dquote = '"';
|
||||
squote = "'";
|
||||
|
||||
string_dquote = (dquote ^dquote* dquote);
|
||||
string_squote = (squote ^squote* squote);
|
||||
|
||||
string = string_dquote | string_squote;
|
||||
|
||||
action emit_string {
|
||||
callback("on_string", data, encoding, ts + 1, te - 1);
|
||||
callback("on_string_body", data, encoding, ts, te);
|
||||
|
||||
if ( lines > 0 )
|
||||
{
|
||||
advance_line(lines);
|
||||
|
||||
lines = 0;
|
||||
}
|
||||
}
|
||||
|
||||
action start_string_squote {
|
||||
callback_simple("on_string_squote");
|
||||
|
||||
fcall string_squote;
|
||||
}
|
||||
|
||||
action start_string_dquote {
|
||||
callback_simple("on_string_dquote");
|
||||
|
||||
fcall string_dquote;
|
||||
}
|
||||
|
||||
string_squote := |*
|
||||
^squote* $count_newlines => emit_string;
|
||||
|
||||
squote => {
|
||||
callback_simple("on_string_squote");
|
||||
|
||||
fret;
|
||||
};
|
||||
*|;
|
||||
|
||||
string_dquote := |*
|
||||
^dquote* $count_newlines => emit_string;
|
||||
|
||||
dquote => {
|
||||
callback_simple("on_string_dquote");
|
||||
|
||||
fret;
|
||||
};
|
||||
*|;
|
||||
|
||||
# DOCTYPES
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
|
||||
|
@ -161,7 +197,8 @@
|
|||
};
|
||||
|
||||
# Lex the public/system IDs as regular strings.
|
||||
string => emit_string;
|
||||
squote => start_string_squote;
|
||||
dquote => start_string_dquote;
|
||||
|
||||
# Whitespace inside doctypes is ignored since there's no point in
|
||||
# including it.
|
||||
|
@ -201,7 +238,8 @@
|
|||
callback("on_attribute", data, encoding, ts, te);
|
||||
};
|
||||
|
||||
string => emit_string;
|
||||
squote => start_string_squote;
|
||||
dquote => start_string_dquote;
|
||||
|
||||
any;
|
||||
*|;
|
||||
|
@ -259,7 +297,8 @@
|
|||
};
|
||||
|
||||
# Attribute values.
|
||||
string => emit_string;
|
||||
squote => start_string_squote;
|
||||
dquote => start_string_dquote;
|
||||
|
||||
# We're done with the open tag of the element.
|
||||
'>' => {
|
||||
|
|
|
@ -182,12 +182,26 @@ module Oga
|
|||
end
|
||||
|
||||
##
|
||||
# Called when processing single/double quoted strings.
|
||||
# Called when processing a single quote.
|
||||
#
|
||||
def on_string_squote
|
||||
add_token(:T_STRING_SQUOTE)
|
||||
end
|
||||
|
||||
##
|
||||
# Called when processing a double quote.
|
||||
#
|
||||
def on_string_dquote
|
||||
add_token(:T_STRING_DQUOTE)
|
||||
end
|
||||
|
||||
##
|
||||
# Called when processing the body of a string.
|
||||
#
|
||||
# @param [String] value The data between the quotes.
|
||||
#
|
||||
def on_string(value)
|
||||
add_token(:T_STRING, Entities.decode(value))
|
||||
def on_string_body(value)
|
||||
add_token(:T_STRING_BODY, Entities.decode(value))
|
||||
end
|
||||
|
||||
##
|
||||
|
|
|
@ -18,7 +18,7 @@
|
|||
#
|
||||
class Oga::XML::Parser
|
||||
|
||||
token T_STRING T_TEXT
|
||||
token T_TEXT T_STRING_SQUOTE T_STRING_DQUOTE T_STRING_BODY
|
||||
token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME
|
||||
token T_DOCTYPE_INLINE
|
||||
token T_CDATA T_COMMENT
|
||||
|
@ -69,13 +69,13 @@ rule
|
|||
}
|
||||
|
||||
# <!DOCTYPE html PUBLIC "foo">
|
||||
| T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_STRING T_DOCTYPE_END
|
||||
| T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE string T_DOCTYPE_END
|
||||
{
|
||||
on_doctype(:name => val[1], :type => val[2], :public_id => val[3])
|
||||
}
|
||||
|
||||
# <!DOCTYPE html PUBLIC "foo" "bar">
|
||||
| T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_STRING T_STRING T_DOCTYPE_END
|
||||
| T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE string string T_DOCTYPE_END
|
||||
{
|
||||
on_doctype(
|
||||
:name => val[1],
|
||||
|
@ -161,7 +161,7 @@ rule
|
|||
: attribute_name { val[0] }
|
||||
|
||||
# foo="bar"
|
||||
| attribute_name T_STRING
|
||||
| attribute_name string
|
||||
{
|
||||
val[0].value = val[1]
|
||||
val[0]
|
||||
|
@ -190,6 +190,28 @@ rule
|
|||
text
|
||||
: T_TEXT { on_text(val[0]) }
|
||||
;
|
||||
|
||||
string
|
||||
: string_dquote
|
||||
| string_squote
|
||||
;
|
||||
|
||||
# Double quoted strings
|
||||
string_dquote
|
||||
: T_STRING_DQUOTE T_STRING_DQUOTE { '' }
|
||||
| T_STRING_DQUOTE string_body T_STRING_DQUOTE { val[1] }
|
||||
;
|
||||
|
||||
# Single quoted strings
|
||||
string_squote
|
||||
: T_STRING_SQUOTE T_STRING_SQUOTE { '' }
|
||||
| T_STRING_SQUOTE string_body T_STRING_SQUOTE { val[1] }
|
||||
;
|
||||
|
||||
string_body
|
||||
: T_STRING_BODY { val[0] }
|
||||
| string_body T_STRING_BODY { val[0] + val[1] }
|
||||
;
|
||||
end
|
||||
|
||||
---- inner
|
||||
|
|
|
@ -15,8 +15,12 @@ describe Oga::XML::Lexer do
|
|||
[:T_DOCTYPE_START, nil, 1],
|
||||
[:T_DOCTYPE_NAME, 'HTML', 1],
|
||||
[:T_DOCTYPE_TYPE, 'PUBLIC', 1],
|
||||
[:T_STRING, 'foobar', 1],
|
||||
[:T_STRING, 'baz', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, 'foobar', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, 'baz', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_DOCTYPE_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
@ -26,8 +30,12 @@ describe Oga::XML::Lexer do
|
|||
[:T_DOCTYPE_START, nil, 1],
|
||||
[:T_DOCTYPE_NAME, 'HTML', 1],
|
||||
[:T_DOCTYPE_TYPE, 'PUBLIC', 1],
|
||||
[:T_STRING, 'foobar', 1],
|
||||
[:T_STRING, 'baz', 1],
|
||||
[:T_STRING_SQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, 'foobar', 1],
|
||||
[:T_STRING_SQUOTE, nil, 1],
|
||||
[:T_STRING_SQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, 'baz', 1],
|
||||
[:T_STRING_SQUOTE, nil, 1],
|
||||
[:T_DOCTYPE_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
@ -48,7 +56,9 @@ describe Oga::XML::Lexer do
|
|||
[:T_DOCTYPE_START, nil, 1],
|
||||
[:T_DOCTYPE_NAME, 'html', 1],
|
||||
[:T_DOCTYPE_INLINE, '<!ELEMENT foo>', 1],
|
||||
[:T_STRING, 'foo', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, 'foo', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_DOCTYPE_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
|
|
@ -58,7 +58,8 @@ describe Oga::XML::Lexer do
|
|||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'p', 1],
|
||||
[:T_ATTR, 'foo', 1],
|
||||
[:T_STRING, '', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
@ -68,18 +69,35 @@ describe Oga::XML::Lexer do
|
|||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'p', 1],
|
||||
[:T_ATTR, 'class', 1],
|
||||
[:T_STRING, 'foo', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, 'foo', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_TEXT, 'Hello', 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
||||
example 'lex a paragraph element with a newline in an attribute' do
|
||||
lex("<p class=\"\nfoo\">Hello</p>").should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'p', 1],
|
||||
[:T_ATTR, 'class', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, "\nfoo", 1],
|
||||
[:T_STRING_DQUOTE, nil, 2],
|
||||
[:T_TEXT, 'Hello', 2],
|
||||
[:T_ELEM_END, nil, 2]
|
||||
]
|
||||
end
|
||||
|
||||
example 'lex a paragraph element with single quoted attributes' do
|
||||
lex("<p class='foo'></p>").should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'p', 1],
|
||||
[:T_ATTR, 'class', 1],
|
||||
[:T_STRING, 'foo', 1],
|
||||
[:T_STRING_SQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, 'foo', 1],
|
||||
[:T_STRING_SQUOTE, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
@ -90,7 +108,9 @@ describe Oga::XML::Lexer do
|
|||
[:T_ELEM_NAME, 'p', 1],
|
||||
[:T_ATTR_NS, 'foo', 1],
|
||||
[:T_ATTR, 'bar', 1],
|
||||
[:T_STRING, 'baz', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, 'baz', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
@ -137,7 +157,9 @@ describe Oga::XML::Lexer do
|
|||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'br', 1],
|
||||
[:T_ATTR, 'class', 1],
|
||||
[:T_STRING, 'foo', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, 'foo', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
|
|
@ -21,7 +21,9 @@ describe Oga::XML::Lexer do
|
|||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'foo', 1],
|
||||
[:T_ATTR, 'class', 1],
|
||||
[:T_STRING, '&', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, '&', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
@ -31,7 +33,9 @@ describe Oga::XML::Lexer do
|
|||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'foo', 1],
|
||||
[:T_ATTR, 'class', 1],
|
||||
[:T_STRING, '<', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, '<', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
@ -41,7 +45,9 @@ describe Oga::XML::Lexer do
|
|||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'foo', 1],
|
||||
[:T_ATTR, 'class', 1],
|
||||
[:T_STRING, '>', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, '>', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_ELEM_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
|
|
@ -2,16 +2,16 @@ require 'spec_helper'
|
|||
|
||||
describe Oga::XML::Lexer do
|
||||
context 'IO as input' do
|
||||
before do
|
||||
@io = StringIO.new("<p class='foo'>\nHello</p>")
|
||||
end
|
||||
|
||||
example 'lex a paragraph element with attributes' do
|
||||
lex(@io).should == [
|
||||
io = StringIO.new("<p class='foo'>\nHello</p>")
|
||||
|
||||
lex(io).should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'p', 1],
|
||||
[:T_ATTR, 'class', 1],
|
||||
[:T_STRING, 'foo', 1],
|
||||
[:T_STRING_SQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, 'foo', 1],
|
||||
[:T_STRING_SQUOTE, nil, 1],
|
||||
[:T_TEXT, "\n", 1],
|
||||
[:T_TEXT, 'Hello', 2],
|
||||
[:T_ELEM_END, nil, 2]
|
||||
|
@ -19,10 +19,43 @@ describe Oga::XML::Lexer do
|
|||
end
|
||||
|
||||
example 'rewind input when resetting the lexer' do
|
||||
lexer = described_class.new(@io)
|
||||
io = StringIO.new("<p class='foo'>\nHello</p>")
|
||||
lexer = described_class.new(io)
|
||||
|
||||
lexer.lex.empty?.should == false
|
||||
lexer.lex.empty?.should == false
|
||||
end
|
||||
|
||||
example 'lex an attribute value starting with a newline' do
|
||||
io = StringIO.new("<foo bar='\n10'></foo>")
|
||||
lexer = described_class.new(io)
|
||||
|
||||
lexer.lex.should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'foo', 1],
|
||||
[:T_ATTR, 'bar', 1],
|
||||
[:T_STRING_SQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, "\n", 1],
|
||||
[:T_STRING_BODY, "10", 2],
|
||||
[:T_STRING_SQUOTE, nil, 2],
|
||||
[:T_ELEM_END, nil, 2]
|
||||
]
|
||||
end
|
||||
|
||||
example 'lex an attribute value split in two by a newline' do
|
||||
io = StringIO.new("<foo bar='foo\nbar'></foo>")
|
||||
lexer = described_class.new(io)
|
||||
|
||||
lexer.lex.should == [
|
||||
[:T_ELEM_START, nil, 1],
|
||||
[:T_ELEM_NAME, 'foo', 1],
|
||||
[:T_ATTR, 'bar', 1],
|
||||
[:T_STRING_SQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, "foo\n", 1],
|
||||
[:T_STRING_BODY, 'bar', 2],
|
||||
[:T_STRING_SQUOTE, nil, 2],
|
||||
[:T_ELEM_END, nil, 2]
|
||||
]
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -17,7 +17,9 @@ describe Oga::XML::Lexer do
|
|||
lex('<?xml version="1.0" ?>').should == [
|
||||
[:T_XML_DECL_START, nil, 1],
|
||||
[:T_ATTR, 'version', 1],
|
||||
[:T_STRING, '1.0', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_STRING_BODY, '1.0', 1],
|
||||
[:T_STRING_DQUOTE, nil, 1],
|
||||
[:T_XML_DECL_END, nil, 1]
|
||||
]
|
||||
end
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::XML::Parser do
|
||||
context 'IO as input' do
|
||||
example 'parse an attribute starting with a newline' do
|
||||
io = StringIO.new("<foo bar='\n10'></foo>")
|
||||
doc = parse(io)
|
||||
|
||||
doc.children[0].attributes[0].value.should == "\n10"
|
||||
end
|
||||
|
||||
example 'parse an attribute value split in two by a newline' do
|
||||
io = StringIO.new("<foo bar='foo\nbar'></foo>")
|
||||
doc = parse(io)
|
||||
|
||||
doc.children[0].attributes[0].value.should == "foo\nbar"
|
||||
end
|
||||
end
|
||||
end
|
Loading…
Reference in New Issue