Lex CDATA tags in chunks

Instead of using a single token (T_CDATA) for a CDATA tag the lexer now
uses 3 tokens:

1. T_CDATA_START
2. T_CDATA_BODY
3. T_CDATA_END

The T_CDATA_BODY token can occur multiple times and is turned into a
single value in the XML parser. This is similar to the way strings are
lexed.

By changing the way CDATA tags are lexed, Oga can now lex CDATA tags
containing newlines when using an IO as input. For example, this would
previously fail:

    Oga.parse_xml(StringIO.new("<![CDATA[\nfoo]]>"))

Because IO input is read one line at a time, the lexer would receive the
input as the following chunks, neither of which contains a complete CDATA
tag on its own:

    "<![CDATA[\n"
    "foo]]>"

Related issues: #93
This commit is contained in:
Yorick Peterse 2015-04-14 22:45:55 +02:00
parent 739e3b474c
commit 8acc7fc743
8 changed files with 153 additions and 18 deletions

View File

@ -90,7 +90,9 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
ID id_advance_line = rb_intern("advance_line");
ID id_on_attribute = rb_intern("on_attribute");
ID id_on_attribute_ns = rb_intern("on_attribute_ns");
ID id_on_cdata = rb_intern("on_cdata");
ID id_on_cdata_start = rb_intern("on_cdata_start");
ID id_on_cdata_body = rb_intern("on_cdata_body");
ID id_on_cdata_end = rb_intern("on_cdata_end");
ID id_on_comment = rb_intern("on_comment");
ID id_on_doctype_end = rb_intern("on_doctype_end");
ID id_on_doctype_inline = rb_intern("on_doctype_inline");

View File

@ -104,7 +104,9 @@ public class Lexer extends RubyObject
String id_advance_line = "advance_line";
String id_on_attribute = "on_attribute";
String id_on_attribute_ns = "on_attribute_ns";
String id_on_cdata = "on_cdata";
String id_on_cdata_start = "on_cdata_start";
String id_on_cdata_body = "on_cdata_body";
String id_on_cdata_end = "on_cdata_end";
String id_on_comment = "on_comment";
String id_on_doctype_end = "on_doctype_end";
String id_on_doctype_inline = "on_doctype_inline";

View File

@ -83,12 +83,35 @@
cdata_start = '<![CDATA[';
cdata_end = ']]>';
cdata = cdata_start (any* -- cdata_end) cdata_end;
# Everything except "]" OR a single "]"
cdata_allowed = (^']'+ | ']') $count_newlines;
action start_cdata {
callback(id_on_cdata, data, encoding, ts + 9, te - 3);
callback_simple(id_on_cdata_start);
fnext cdata_body;
}
cdata_body := |*
cdata_allowed => {
callback(id_on_cdata_body, data, encoding, ts, te);
if ( lines > 0 )
{
advance_line(lines);
lines = 0;
}
};
cdata_end => {
callback_simple(id_on_cdata_end);
fnext main;
};
*|;
# Processing Instructions
#
# http://www.w3.org/TR/xpath/#section-Processing-Instruction-Nodes
@ -439,7 +462,7 @@
doctype_start => start_doctype;
xml_decl_start => start_xml_decl;
comment => start_comment;
cdata => start_cdata;
cdata_start => start_cdata;
proc_ins_start => start_proc_ins;
element_start => start_element;
element_end => close_element;

View File

@ -262,10 +262,26 @@ module Oga
end
##
# Called on a CDATA tag.
# Called on the open CDATA tag.
#
def on_cdata(value)
add_token(:T_CDATA, value)
def on_cdata_start
add_token(:T_CDATA_START)
end
##
# Called on the closing CDATA tag (`]]>`).
#
# Adds a `T_CDATA_END` token; no value is passed along with the token.
#
def on_cdata_end
add_token(:T_CDATA_END)
end
##
# Called for the body of a CDATA tag.
#
# The lexer may call this more than once for a single CDATA tag (for
# example when the body contains "]" characters or when the input is read
# in chunks); every call adds its own `T_CDATA_BODY` token.
#
# @param [String] value The chunk of CDATA text to store in the token.
#
def on_cdata_body(value)
add_token(:T_CDATA_BODY, value)
end
##

View File

@ -24,7 +24,8 @@
%terminals T_TEXT T_STRING_SQUOTE T_STRING_DQUOTE T_STRING_BODY;
%terminals T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE T_DOCTYPE_NAME;
%terminals T_DOCTYPE_INLINE T_CDATA T_COMMENT;
%terminals T_DOCTYPE_INLINE T_COMMENT;
%terminals T_CDATA_START T_CDATA_BODY T_CDATA_END;
%terminals T_ELEM_START T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR T_ATTR_NS;
%terminals T_XML_DECL_START T_XML_DECL_END;
%terminals T_PROC_INS_START T_PROC_INS_NAME T_PROC_INS_END;
@ -93,7 +94,12 @@ doctype_types
# CDATA tags
cdata
= T_CDATA { on_cdata(val[0]) }
= T_CDATA_START cdata_body T_CDATA_END { on_cdata(val[1]) }
;
cdata_body
= T_CDATA_BODY cdata_body { val[0] + val[1] }
| _ { '' }
;
# Comments

View File

@ -1,30 +1,107 @@
require 'spec_helper'
describe Oga::XML::Lexer do
describe 'cdata tags' do
it 'lexes a cdata tag' do
lex('<![CDATA[foo]]>').should == [[:T_CDATA, 'foo', 1]]
describe 'CDATA tags' do
it 'lexes a CDATA tag' do
lex('<![CDATA[foo]]>').should == [
[:T_CDATA_START, nil, 1],
[:T_CDATA_BODY, 'foo', 1],
[:T_CDATA_END, nil, 1]
]
end
it 'lexes tags inside CDATA tags as regular text' do
lex('<![CDATA[<p>Foo</p>]]>').should == [[:T_CDATA, '<p>Foo</p>', 1]]
lex('<![CDATA[<p>Foo</p>]]>').should == [
[:T_CDATA_START, nil, 1],
[:T_CDATA_BODY, '<p>Foo</p>', 1],
[:T_CDATA_END, nil, 1]
]
end
it 'lexes a single bracket inside a CDATA tag' do
lex('<![CDATA[]]]>').should == [
[:T_CDATA_START, nil, 1],
[:T_CDATA_BODY, ']', 1],
[:T_CDATA_END, nil, 1]
]
end
it 'lexes double brackets inside a CDATA tag' do
lex('<![CDATA[]]]]>').should == [[:T_CDATA, ']]', 1]]
lex('<![CDATA[]]]]>').should == [
[:T_CDATA_START, nil, 1],
[:T_CDATA_BODY, ']', 1],
[:T_CDATA_BODY, ']', 1],
[:T_CDATA_END, nil, 1]
]
end
it 'lexes two CDATA tags following each other' do
lex('<a><![CDATA[foo]]><b><![CDATA[bar]]></b></a>').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'a', 1],
[:T_CDATA, 'foo', 1],
[:T_CDATA_START, nil, 1],
[:T_CDATA_BODY, 'foo', 1],
[:T_CDATA_END, nil, 1],
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'b', 1],
[:T_CDATA, 'bar', 1],
[:T_CDATA_START, nil, 1],
[:T_CDATA_BODY, 'bar', 1],
[:T_CDATA_END, nil, 1],
[:T_ELEM_END, nil, 1],
[:T_ELEM_END, nil, 1]
]
end
it 'lexes a CDATA tag containing a newline after the open tag' do
lex("<![CDATA[\nfoo]]>").should == [
[:T_CDATA_START, nil, 1],
[:T_CDATA_BODY, "\nfoo", 1],
[:T_CDATA_END, nil, 2]
]
end
it 'lexes a CDATA tag containing a newline before the closing tag' do
lex("<![CDATA[foo\n]]>").should == [
[:T_CDATA_START, nil, 1],
[:T_CDATA_BODY, "foo\n", 1],
[:T_CDATA_END, nil, 2]
]
end
it 'lexes a CDATA tag with the body surrounded by newlines' do
lex("<![CDATA[\nfoo\n]]>").should == [
[:T_CDATA_START, nil, 1],
[:T_CDATA_BODY, "\nfoo\n", 1],
[:T_CDATA_END, nil, 3]
]
end
describe 'using an IO as input' do
it 'lexes a CDATA tag containing a newline after the open tag' do
lex_stringio("<![CDATA[\nfoo]]>").should == [
[:T_CDATA_START, nil, 1],
[:T_CDATA_BODY, "\n", 1],
[:T_CDATA_BODY, "foo", 2],
[:T_CDATA_END, nil, 2]
]
end
it 'lexes a CDATA tag containing a newline before the closing tag' do
lex_stringio("<![CDATA[foo\n]]>").should == [
[:T_CDATA_START, nil, 1],
[:T_CDATA_BODY, "foo\n", 1],
[:T_CDATA_END, nil, 2]
]
end
it 'lexes a CDATA tag with the body surrounded by newlines' do
lex_stringio("<![CDATA[\nfoo\n]]>").should == [
[:T_CDATA_START, nil, 1],
[:T_CDATA_BODY, "\n", 1],
[:T_CDATA_BODY, "foo\n", 2],
[:T_CDATA_END, nil, 3]
]
end
end
end
end

View File

@ -30,7 +30,9 @@ describe Oga::XML::Lexer do
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'script', 1],
[:T_TEXT, @javascript, 1],
[:T_CDATA, 'foo', 1],
[:T_CDATA_START, nil, 1],
[:T_CDATA_BODY, 'foo', 1],
[:T_CDATA_END, nil, 1],
[:T_ELEM_END, nil, 1]
]
end

View File

@ -29,6 +29,13 @@ module Oga
return Oga::XML::Lexer.new(input, options).lex
end
##
# Lexes the given String after wrapping it in a StringIO, so the lexer
# reads its input from an IO object instead of a String.
#
# @see #lex
# @param [String] input The data to wrap and lex.
# @param [Hash] options Passed to `lex` unchanged.
# @return [Array] Whatever `lex` returns for the wrapped input.
#
def lex_stringio(input, options = {})
  io = StringIO.new(input)

  lex(io, options)
end
##
# Lexes an XPath expression.
#