Lex processing instructions in chunks

Similar to comments (ea8b4aa92f) and CDATA tags (8acc7fc743), processing
instructions are now lexed in separate chunks _with_ proper support for
streaming input.

Related issue: #93
This commit is contained in:
Yorick Peterse 2015-04-15 00:11:57 +02:00
parent ea8b4aa92f
commit b2ea20ba61
6 changed files with 118 additions and 13 deletions

View File

@ -109,6 +109,7 @@ VALUE oga_xml_lexer_advance(VALUE self, VALUE data_block)
ID id_on_proc_ins_end = rb_intern("on_proc_ins_end");
ID id_on_proc_ins_name = rb_intern("on_proc_ins_name");
ID id_on_proc_ins_start = rb_intern("on_proc_ins_start");
ID id_on_proc_ins_body = rb_intern("on_proc_ins_body");
ID id_on_string_body = rb_intern("on_string_body");
ID id_on_string_dquote = rb_intern("on_string_dquote");
ID id_on_string_squote = rb_intern("on_string_squote");

View File

@ -123,6 +123,7 @@ public class Lexer extends RubyObject
String id_on_proc_ins_end = "on_proc_ins_end";
String id_on_proc_ins_name = "on_proc_ins_name";
String id_on_proc_ins_start = "on_proc_ins_start";
String id_on_proc_ins_body = "on_proc_ins_body";
String id_on_string_body = "on_string_body";
String id_on_string_dquote = "on_string_dquote";
String id_on_string_squote = "on_string_squote";

View File

@ -149,26 +149,33 @@
proc_ins_start = '<?' identifier;
proc_ins_end = '?>';
# Everything except "?" OR a single "?"
proc_ins_allowed = (^'?'+ | '?') $count_newlines;
action start_proc_ins {
callback_simple(id_on_proc_ins_start);
callback(id_on_proc_ins_name, data, encoding, ts + 2, te);
mark = te;
fnext proc_ins_body;
}
proc_ins_body := |*
proc_ins_end => {
callback(id_on_text, data, encoding, mark, ts);
callback_simple(id_on_proc_ins_end);
proc_ins_allowed => {
callback(id_on_proc_ins_body, data, encoding, ts, te);
mark = 0;
if ( lines > 0 )
{
advance_line(lines);
lines = 0;
}
};
proc_ins_end => {
callback_simple(id_on_proc_ins_end);
fnext main;
};
any;
*|;
# Strings

View File

@ -344,6 +344,15 @@ module Oga
add_token(:T_PROC_INS_NAME, value)
end
##
# Called on the body of a processing instruction.
#
# Adds a `T_PROC_INS_BODY` token carrying the given text. The lexer emits
# the body in chunks, so this callback may fire more than once for a single
# instruction (for example around a lone "?" or when input is streamed).
#
# @param [String] value The chunk of body text that was lexed.
#
def on_proc_ins_body(value)
add_token(:T_PROC_INS_BODY, value)
end
##
# Called on the end of a processing instruction.
#

View File

@ -29,7 +29,7 @@
%terminals T_CDATA_START T_CDATA_BODY T_CDATA_END;
%terminals T_ELEM_START T_ELEM_NAME T_ELEM_NS T_ELEM_END T_ATTR T_ATTR_NS;
%terminals T_XML_DECL_START T_XML_DECL_END;
%terminals T_PROC_INS_START T_PROC_INS_NAME T_PROC_INS_END;
%terminals T_PROC_INS_START T_PROC_INS_NAME T_PROC_INS_BODY T_PROC_INS_END;
document
= expressions { on_document(val[0]) }
@ -117,12 +117,17 @@ comment_body
# Processing Instructions
proc_ins
= T_PROC_INS_START T_PROC_INS_NAME T_TEXT? T_PROC_INS_END
= T_PROC_INS_START T_PROC_INS_NAME proc_ins_body T_PROC_INS_END
{
on_proc_ins(val[1], val[2])
}
;
proc_ins_body
= T_PROC_INS_BODY proc_ins_body { val[0] + val[1] }
| _ { '' }
;
# Elements
element_name_ns

View File

@ -2,7 +2,7 @@ require 'spec_helper'
describe Oga::XML::Lexer do
describe 'processing instructions' do
it 'lexes a processing instruction' do
it 'lexes an instruction' do
lex('<?foo?>').should == [
[:T_PROC_INS_START, nil, 1],
[:T_PROC_INS_NAME, 'foo', 1],
@ -10,13 +10,95 @@ describe Oga::XML::Lexer do
]
end
it 'lexes a processing instruction containing text' do
it 'lexes an instruction containing text' do
lex('<?foo bar ?>').should == [
[:T_PROC_INS_START, nil, 1],
[:T_PROC_INS_NAME, 'foo', 1],
[:T_TEXT, ' bar ', 1],
[:T_PROC_INS_BODY, ' bar ', 1],
[:T_PROC_INS_END, nil, 1]
]
end
it 'lexes an instruction containing a ?' do
lex('<?foo ? ?>').should == [
[:T_PROC_INS_START, nil, 1],
[:T_PROC_INS_NAME, 'foo', 1],
[:T_PROC_INS_BODY, ' ', 1],
[:T_PROC_INS_BODY, '?', 1],
[:T_PROC_INS_BODY, ' ', 1],
[:T_PROC_INS_END, nil, 1]
]
end
it 'lexes two instructions following each other' do
lex('<?foo bar ?><?foo baz ?>').should == [
[:T_PROC_INS_START, nil, 1],
[:T_PROC_INS_NAME, 'foo', 1],
[:T_PROC_INS_BODY, ' bar ', 1],
[:T_PROC_INS_END, nil, 1],
[:T_PROC_INS_START, nil, 1],
[:T_PROC_INS_NAME, 'foo', 1],
[:T_PROC_INS_BODY, ' baz ', 1],
[:T_PROC_INS_END, nil, 1]
]
end
it 'lexes an instruction with a newline after the name' do
lex("<?foo\nbar?>").should == [
[:T_PROC_INS_START, nil, 1],
[:T_PROC_INS_NAME, 'foo', 1],
[:T_PROC_INS_BODY, "\nbar", 1],
[:T_PROC_INS_END, nil, 2]
]
end
it 'lexes an instruction with a newline before the closing tag' do
lex("<?foo bar\n?>").should == [
[:T_PROC_INS_START, nil, 1],
[:T_PROC_INS_NAME, 'foo', 1],
[:T_PROC_INS_BODY, " bar\n", 1],
[:T_PROC_INS_END, nil, 2]
]
end
it 'lexes an instruction with the body surrounded by newlines' do
lex("<?foo\nbar\n?>").should == [
[:T_PROC_INS_START, nil, 1],
[:T_PROC_INS_NAME, 'foo', 1],
[:T_PROC_INS_BODY, "\nbar\n", 1],
[:T_PROC_INS_END, nil, 3]
]
end
describe 'using an IO as input' do
it 'lexes an instruction with a newline after the name' do
lex_stringio("<?foo\nbar?>").should == [
[:T_PROC_INS_START, nil, 1],
[:T_PROC_INS_NAME, 'foo', 1],
[:T_PROC_INS_BODY, "\n", 1],
[:T_PROC_INS_BODY, "bar", 2],
[:T_PROC_INS_END, nil, 2]
]
end
it 'lexes an instruction with a newline before the closing tag' do
lex_stringio("<?foo bar\n?>").should == [
[:T_PROC_INS_START, nil, 1],
[:T_PROC_INS_NAME, 'foo', 1],
[:T_PROC_INS_BODY, " bar\n", 1],
[:T_PROC_INS_END, nil, 2]
]
end
it 'lexes an instruction with the body surrounded by newlines' do
lex_stringio("<?foo\nbar\n?>").should == [
[:T_PROC_INS_START, nil, 1],
[:T_PROC_INS_NAME, 'foo', 1],
[:T_PROC_INS_BODY, "\n", 1],
[:T_PROC_INS_BODY, "bar\n", 2],
[:T_PROC_INS_END, nil, 3]
]
end
end
end
end