Better support for lexing multi-line strings.

When lexing multi-line strings, everything used to work fine as long as the
input was read as a whole. However, when using an IO instance all hell would
break loose. Because the lexer reads IO instances one line at a time, Ragel
would sometimes end up setting "ts" to NULL. For example, the following
input would break the lexer:

    <foo class="\nbar" />

Due to the input being read per line, the following data would be sent to the
lexer:

    <foo class="\n
    bar" />

This would result in different (or NULL) pointers being used for building a
string, in turn resulting in memory allocation errors.

To work around this the string lexing setup has been broken into separate
machines for single and double quoted strings. The tokens used have also been
changed so that instead of just "T_STRING" there are now the following tokens:

* T_STRING_SQUOTE
* T_STRING_DQUOTE
* T_STRING_BODY

A string can consist of multiple T_STRING_BODY tokens (this only happens for
multi-line strings read from IO inputs). These tokens are stitched back together
by the parser.

This fixes #58.
This commit is contained in:
Yorick Peterse 2014-10-26 11:39:56 +01:00
parent fca88a69d1
commit 24ae791f00
9 changed files with 204 additions and 37 deletions

View File

@ -106,6 +106,8 @@
callback("on_text", data, encoding, mark, ts);
callback_simple("on_proc_ins_end");
mark = 0;
fnext main;
};
@ -121,15 +123,49 @@
dquote = '"';
squote = "'";
string_dquote = (dquote ^dquote* dquote);
string_squote = (squote ^squote* squote);
string = string_dquote | string_squote;
action emit_string {
callback("on_string", data, encoding, ts + 1, te - 1);
callback("on_string_body", data, encoding, ts, te);
if ( lines > 0 )
{
advance_line(lines);
lines = 0;
}
}
# Fired when an opening single quote is matched: reports the quote through the
# "on_string_squote" callback and then calls into the string_squote machine,
# which lexes the string body and returns here (via fret) on the closing quote.
action start_string_squote {
callback_simple("on_string_squote");
fcall string_squote;
}
# Fired when an opening double quote is matched: reports the quote through the
# "on_string_dquote" callback and then calls into the string_dquote machine,
# which lexes the string body and returns here (via fret) on the closing quote.
action start_string_dquote {
callback_simple("on_string_dquote");
fcall string_dquote;
}
# Scanner for the contents of a single quoted string. It is entered through the
# fcall in start_string_squote and returns to the calling machine once the
# closing quote has been seen.
string_squote := |*
# Any run of characters other than a single quote is string body data. The
# count_newlines action (defined outside this excerpt) runs on every character
# of the match.
^squote* $count_newlines => emit_string;
# The closing quote: report it and return to the machine that issued the fcall.
squote => {
callback_simple("on_string_squote");
fret;
};
*|;
# Scanner for the contents of a double quoted string. It is entered through the
# fcall in start_string_dquote and returns to the calling machine once the
# closing quote has been seen.
string_dquote := |*
# Any run of characters other than a double quote is string body data. The
# count_newlines action (defined outside this excerpt) runs on every character
# of the match.
^dquote* $count_newlines => emit_string;
# The closing quote: report it and return to the machine that issued the fcall.
dquote => {
callback_simple("on_string_dquote");
fret;
};
*|;
# DOCTYPES
#
# http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
@ -161,7 +197,8 @@
};
# Lex the public/system IDs as regular strings.
string => emit_string;
squote => start_string_squote;
dquote => start_string_dquote;
# Whitespace inside doctypes is ignored since there's no point in
# including it.
@ -201,7 +238,8 @@
callback("on_attribute", data, encoding, ts, te);
};
string => emit_string;
squote => start_string_squote;
dquote => start_string_dquote;
any;
*|;
@ -259,7 +297,8 @@
};
# Attribute values.
string => emit_string;
squote => start_string_squote;
dquote => start_string_dquote;
# We're done with the open tag of the element.
'>' => {

View File

@ -182,12 +182,26 @@ module Oga
end
##
# Called when processing single/double quoted strings.
# Called when processing a single quote.
#
# The lexer invokes this for both the opening and the closing quote of a single
# quoted string, so the same value-less T_STRING_SQUOTE token marks either end.
def on_string_squote
add_token(:T_STRING_SQUOTE)
end
##
# Called when processing a double quote.
#
# The lexer invokes this for both the opening and the closing quote of a double
# quoted string, so the same value-less T_STRING_DQUOTE token marks either end.
def on_string_dquote
add_token(:T_STRING_DQUOTE)
end
##
# Called when processing the body of a string.
#
# @param [String] value The data between the quotes.
#
def on_string(value)
add_token(:T_STRING, Entities.decode(value))
def on_string_body(value)
add_token(:T_STRING_BODY, Entities.decode(value))
end
##

View File

@ -18,7 +18,7 @@
#
class Oga::XML::Parser
token T_STRING T_TEXT
token T_TEXT T_STRING_SQUOTE T_STRING_DQUOTE T_STRING_BODY
token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME
token T_DOCTYPE_INLINE
token T_CDATA T_COMMENT
@ -69,13 +69,13 @@ rule
}
# <!DOCTYPE html PUBLIC "foo">
| T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_STRING T_DOCTYPE_END
| T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE string T_DOCTYPE_END
{
on_doctype(:name => val[1], :type => val[2], :public_id => val[3])
}
# <!DOCTYPE html PUBLIC "foo" "bar">
| T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE T_STRING T_STRING T_DOCTYPE_END
| T_DOCTYPE_START T_DOCTYPE_NAME T_DOCTYPE_TYPE string string T_DOCTYPE_END
{
on_doctype(
:name => val[1],
@ -161,7 +161,7 @@ rule
: attribute_name { val[0] }
# foo="bar"
| attribute_name T_STRING
| attribute_name string
{
val[0].value = val[1]
val[0]
@ -190,6 +190,28 @@ rule
text
: T_TEXT { on_text(val[0]) }
;
string
: string_dquote
| string_squote
;
# Double quoted strings
string_dquote
: T_STRING_DQUOTE T_STRING_DQUOTE { '' }
| T_STRING_DQUOTE string_body T_STRING_DQUOTE { val[1] }
;
# Single quoted strings
string_squote
: T_STRING_SQUOTE T_STRING_SQUOTE { '' }
| T_STRING_SQUOTE string_body T_STRING_SQUOTE { val[1] }
;
string_body
: T_STRING_BODY { val[0] }
| string_body T_STRING_BODY { val[0] + val[1] }
;
end
---- inner

View File

@ -15,8 +15,12 @@ describe Oga::XML::Lexer do
[:T_DOCTYPE_START, nil, 1],
[:T_DOCTYPE_NAME, 'HTML', 1],
[:T_DOCTYPE_TYPE, 'PUBLIC', 1],
[:T_STRING, 'foobar', 1],
[:T_STRING, 'baz', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, 'foobar', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, 'baz', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_DOCTYPE_END, nil, 1]
]
end
@ -26,8 +30,12 @@ describe Oga::XML::Lexer do
[:T_DOCTYPE_START, nil, 1],
[:T_DOCTYPE_NAME, 'HTML', 1],
[:T_DOCTYPE_TYPE, 'PUBLIC', 1],
[:T_STRING, 'foobar', 1],
[:T_STRING, 'baz', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_STRING_BODY, 'foobar', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_STRING_BODY, 'baz', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_DOCTYPE_END, nil, 1]
]
end
@ -48,7 +56,9 @@ describe Oga::XML::Lexer do
[:T_DOCTYPE_START, nil, 1],
[:T_DOCTYPE_NAME, 'html', 1],
[:T_DOCTYPE_INLINE, '<!ELEMENT foo>', 1],
[:T_STRING, 'foo', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, 'foo', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_DOCTYPE_END, nil, 1]
]
end

View File

@ -58,7 +58,8 @@ describe Oga::XML::Lexer do
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'p', 1],
[:T_ATTR, 'foo', 1],
[:T_STRING, '', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
@ -68,18 +69,35 @@ describe Oga::XML::Lexer do
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'p', 1],
[:T_ATTR, 'class', 1],
[:T_STRING, 'foo', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, 'foo', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_TEXT, 'Hello', 1],
[:T_ELEM_END, nil, 1]
]
end
example 'lex a paragraph element with a newline in an attribute' do
lex("<p class=\"\nfoo\">Hello</p>").should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'p', 1],
[:T_ATTR, 'class', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, "\nfoo", 1],
[:T_STRING_DQUOTE, nil, 2],
[:T_TEXT, 'Hello', 2],
[:T_ELEM_END, nil, 2]
]
end
example 'lex a paragraph element with single quoted attributes' do
lex("<p class='foo'></p>").should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'p', 1],
[:T_ATTR, 'class', 1],
[:T_STRING, 'foo', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_STRING_BODY, 'foo', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
@ -90,7 +108,9 @@ describe Oga::XML::Lexer do
[:T_ELEM_NAME, 'p', 1],
[:T_ATTR_NS, 'foo', 1],
[:T_ATTR, 'bar', 1],
[:T_STRING, 'baz', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, 'baz', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
@ -137,7 +157,9 @@ describe Oga::XML::Lexer do
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'br', 1],
[:T_ATTR, 'class', 1],
[:T_STRING, 'foo', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, 'foo', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end

View File

@ -21,7 +21,9 @@ describe Oga::XML::Lexer do
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'foo', 1],
[:T_ATTR, 'class', 1],
[:T_STRING, '&', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, '&', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
@ -31,7 +33,9 @@ describe Oga::XML::Lexer do
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'foo', 1],
[:T_ATTR, 'class', 1],
[:T_STRING, '<', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, '<', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
@ -41,7 +45,9 @@ describe Oga::XML::Lexer do
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'foo', 1],
[:T_ATTR, 'class', 1],
[:T_STRING, '>', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, '>', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_ELEM_END, nil, 1]
]
end

View File

@ -2,16 +2,16 @@ require 'spec_helper'
describe Oga::XML::Lexer do
context 'IO as input' do
before do
@io = StringIO.new("<p class='foo'>\nHello</p>")
end
example 'lex a paragraph element with attributes' do
lex(@io).should == [
io = StringIO.new("<p class='foo'>\nHello</p>")
lex(io).should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'p', 1],
[:T_ATTR, 'class', 1],
[:T_STRING, 'foo', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_STRING_BODY, 'foo', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_TEXT, "\n", 1],
[:T_TEXT, 'Hello', 2],
[:T_ELEM_END, nil, 2]
@ -19,10 +19,43 @@ describe Oga::XML::Lexer do
end
example 'rewind input when resetting the lexer' do
lexer = described_class.new(@io)
io = StringIO.new("<p class='foo'>\nHello</p>")
lexer = described_class.new(io)
lexer.lex.empty?.should == false
lexer.lex.empty?.should == false
end
example 'lex an attribute value starting with a newline' do
io = StringIO.new("<foo bar='\n10'></foo>")
lexer = described_class.new(io)
lexer.lex.should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'foo', 1],
[:T_ATTR, 'bar', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_STRING_BODY, "\n", 1],
[:T_STRING_BODY, "10", 2],
[:T_STRING_SQUOTE, nil, 2],
[:T_ELEM_END, nil, 2]
]
end
example 'lex an attribute value split in two by a newline' do
io = StringIO.new("<foo bar='foo\nbar'></foo>")
lexer = described_class.new(io)
lexer.lex.should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'foo', 1],
[:T_ATTR, 'bar', 1],
[:T_STRING_SQUOTE, nil, 1],
[:T_STRING_BODY, "foo\n", 1],
[:T_STRING_BODY, 'bar', 2],
[:T_STRING_SQUOTE, nil, 2],
[:T_ELEM_END, nil, 2]
]
end
end
end

View File

@ -17,7 +17,9 @@ describe Oga::XML::Lexer do
lex('<?xml version="1.0" ?>').should == [
[:T_XML_DECL_START, nil, 1],
[:T_ATTR, 'version', 1],
[:T_STRING, '1.0', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_STRING_BODY, '1.0', 1],
[:T_STRING_DQUOTE, nil, 1],
[:T_XML_DECL_END, nil, 1]
]
end

View File

@ -0,0 +1,19 @@
require 'spec_helper'
describe Oga::XML::Parser do
context 'IO as input' do
example 'parse an attribute starting with a newline' do
io = StringIO.new("<foo bar='\n10'></foo>")
doc = parse(io)
doc.children[0].attributes[0].value.should == "\n10"
end
example 'parse an attribute value split in two by a newline' do
io = StringIO.new("<foo bar='foo\nbar'></foo>")
doc = parse(io)
doc.children[0].attributes[0].value.should == "foo\nbar"
end
end
end