Use separate Ragel machines for script/style tags
Previously a single Ragel machine was used for processing both HTML script and style tags. This had the unfortunate side effect that the following was not parsed correctly, even though it is valid HTML:

    <script> var foo = "</style>"; </script>

The same applied to style tags:

    <style> /* </script> */ </style>

By using a separate machine for each tag we can work around the above issue: each machine only treats its own closing tag as the end of the element. The downside is that this can produce multiple consecutive T_TEXT tokens, which have to be stitched back together in the parser.
This commit is contained in:
parent
2d43e459a1
commit
73fbbfbdbd
|
@ -19,11 +19,15 @@ on `ts` and `te`) so the macro ignores this argument.
|
||||||
#define advance_line(amount) \
|
#define advance_line(amount) \
|
||||||
rb_funcall(self, id_advance_line, 1, INT2NUM(amount));
|
rb_funcall(self, id_advance_line, 1, INT2NUM(amount));
|
||||||
|
|
||||||
#define literal_html_element_p() \
|
#define html_script_p() \
|
||||||
rb_funcall(self, id_literal_html_element_p, 0) == Qtrue
|
rb_funcall(self, id_html_script_p, 0) == Qtrue
|
||||||
|
|
||||||
|
#define html_style_p() \
|
||||||
|
rb_funcall(self, id_html_style_p, 0) == Qtrue
|
||||||
|
|
||||||
ID id_advance_line;
|
ID id_advance_line;
|
||||||
ID id_literal_html_element_p;
|
ID id_html_script_p;
|
||||||
|
ID id_html_style_p;
|
||||||
ID id_html;
|
ID id_html;
|
||||||
|
|
||||||
%%machine c_lexer;
|
%%machine c_lexer;
|
||||||
|
@ -183,9 +187,10 @@ void Init_liboga_xml_lexer()
|
||||||
VALUE mXML = rb_const_get(mOga, rb_intern("XML"));
|
VALUE mXML = rb_const_get(mOga, rb_intern("XML"));
|
||||||
VALUE cLexer = rb_define_class_under(mXML, "Lexer", rb_cObject);
|
VALUE cLexer = rb_define_class_under(mXML, "Lexer", rb_cObject);
|
||||||
|
|
||||||
id_advance_line = rb_intern("advance_line");
|
id_advance_line = rb_intern("advance_line");
|
||||||
id_literal_html_element_p = rb_intern("literal_html_element?");
|
id_html_script_p = rb_intern("html_script?");
|
||||||
id_html = rb_intern("html");
|
id_html_style_p = rb_intern("html_style?");
|
||||||
|
id_html = rb_intern("html");
|
||||||
|
|
||||||
rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1);
|
rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1);
|
||||||
rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0);
|
rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0);
|
||||||
|
|
|
@ -194,13 +194,23 @@ public class Lexer extends RubyObject
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* See * Oga::XML::Lexer#literal_html_element? for more information.
|
* @see Oga::XML::Lexer#html_script?
|
||||||
*/
|
*/
|
||||||
public Boolean literal_html_element_p()
|
public Boolean html_script_p()
|
||||||
{
|
{
|
||||||
ThreadContext context = this.runtime.getCurrentContext();
|
ThreadContext context = this.runtime.getCurrentContext();
|
||||||
|
|
||||||
return this.callMethod(context, "literal_html_element?").isTrue();
|
return this.callMethod(context, "html_script?").isTrue();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @see Oga::XML::Lexer#html_style?
|
||||||
|
*/
|
||||||
|
public Boolean html_style_p()
|
||||||
|
{
|
||||||
|
ThreadContext context = this.runtime.getCurrentContext();
|
||||||
|
|
||||||
|
return this.callMethod(context, "html_style?").isTrue();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -58,7 +58,7 @@
|
||||||
}
|
}
|
||||||
|
|
||||||
action advance_newline {
|
action advance_newline {
|
||||||
advance_line(1)
|
advance_line(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
action hold_and_return {
|
action hold_and_return {
|
||||||
|
@ -376,6 +376,12 @@
|
||||||
callback_simple(id_on_element_end);
|
callback_simple(id_on_element_end);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
action close_element_fnext_main {
|
||||||
|
callback_simple(id_on_element_end);
|
||||||
|
|
||||||
|
fnext main;
|
||||||
|
}
|
||||||
|
|
||||||
# Machine used for lexing the name/namespace of an element.
|
# Machine used for lexing the name/namespace of an element.
|
||||||
element_name := |*
|
element_name := |*
|
||||||
identifier ':' => {
|
identifier ':' => {
|
||||||
|
@ -465,9 +471,13 @@
|
||||||
'>' => {
|
'>' => {
|
||||||
callback_simple(id_on_element_open_end);
|
callback_simple(id_on_element_open_end);
|
||||||
|
|
||||||
if ( literal_html_element_p() )
|
if ( html_script_p() )
|
||||||
{
|
{
|
||||||
fnext literal_html_element;
|
fnext html_script;
|
||||||
|
}
|
||||||
|
else if ( html_style_p() )
|
||||||
|
{
|
||||||
|
fnext html_style;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -506,6 +516,17 @@
|
||||||
terminate_text = '</' | '<!' | '<?' | element_start;
|
terminate_text = '</' | '<!' | '<?' | element_start;
|
||||||
allowed_text = (any* -- terminate_text) $count_newlines;
|
allowed_text = (any* -- terminate_text) $count_newlines;
|
||||||
|
|
||||||
|
action emit_text {
|
||||||
|
callback(id_on_text, data, encoding, ts, te);
|
||||||
|
|
||||||
|
if ( lines > 0 )
|
||||||
|
{
|
||||||
|
advance_line(lines);
|
||||||
|
|
||||||
|
lines = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
text := |*
|
text := |*
|
||||||
terminate_text | allowed_text => {
|
terminate_text | allowed_text => {
|
||||||
callback(id_on_text, data, encoding, ts, te);
|
callback(id_on_text, data, encoding, ts, te);
|
||||||
|
@ -541,36 +562,17 @@
|
||||||
# Certain tags in HTML can contain basically anything except for the literal
|
# Certain tags in HTML can contain basically anything except for the literal
|
||||||
# closing tag. Two examples are script and style tags. As a result of this
|
# closing tag. Two examples are script and style tags. As a result of this
|
||||||
# we can't use the regular text machine.
|
# we can't use the regular text machine.
|
||||||
literal_html_closing_tags = '</script>' | '</style>';
|
|
||||||
literal_html_allowed = (any* -- literal_html_closing_tags) $count_newlines;
|
|
||||||
|
|
||||||
literal_html_element := |*
|
literal_html_allowed = (^'<'+ | '<'+) $count_newlines;
|
||||||
literal_html_allowed => {
|
|
||||||
callback(id_on_text, data, encoding, ts, te);
|
|
||||||
|
|
||||||
if ( lines > 0 )
|
html_script := |*
|
||||||
{
|
literal_html_allowed => emit_text;
|
||||||
advance_line(lines);
|
'</script>' => close_element_fnext_main;
|
||||||
|
*|;
|
||||||
|
|
||||||
lines = 0;
|
html_style := |*
|
||||||
}
|
literal_html_allowed => emit_text;
|
||||||
};
|
'</style>' => close_element_fnext_main;
|
||||||
|
|
||||||
literal_html_allowed %{ mark = p; } literal_html_closing_tags => {
|
|
||||||
callback(id_on_text, data, encoding, ts, mark);
|
|
||||||
|
|
||||||
p = mark - 1;
|
|
||||||
mark = 0;
|
|
||||||
|
|
||||||
if ( lines > 0 )
|
|
||||||
{
|
|
||||||
advance_line(lines);
|
|
||||||
|
|
||||||
lines = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
fnext main;
|
|
||||||
};
|
|
||||||
*|;
|
*|;
|
||||||
|
|
||||||
# The main machine aka the entry point of Ragel.
|
# The main machine aka the entry point of Ragel.
|
||||||
|
|
|
@ -40,12 +40,18 @@ module Oga
|
||||||
class Lexer
|
class Lexer
|
||||||
attr_reader :html
|
attr_reader :html
|
||||||
|
|
||||||
|
# @return [String]
|
||||||
|
HTML_SCRIPT = 'script'
|
||||||
|
|
||||||
|
# @return [String]
|
||||||
|
HTML_STYLE = 'style'
|
||||||
|
|
||||||
##
|
##
|
||||||
# Names of HTML tags of which the content should be lexed as-is.
|
# Names of HTML tags of which the content should be lexed as-is.
|
||||||
#
|
#
|
||||||
# @return [Array]
|
# @return [Array]
|
||||||
#
|
#
|
||||||
LITERAL_HTML_ELEMENTS = %w{script style}
|
LITERAL_HTML_ELEMENTS = [HTML_SCRIPT, HTML_STYLE]
|
||||||
|
|
||||||
##
|
##
|
||||||
# @param [String|IO] data The data to lex. This can either be a String or
|
# @param [String|IO] data The data to lex. This can either be a String or
|
||||||
|
@ -189,12 +195,17 @@ module Oga
|
||||||
end
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
# Returns true if the current element's content should be lexed as-is.
|
|
||||||
#
|
|
||||||
# @return [TrueClass|FalseClass]
|
# @return [TrueClass|FalseClass]
|
||||||
#
|
#
|
||||||
def literal_html_element?
|
def html_script?
|
||||||
return html? && LITERAL_HTML_ELEMENTS.include?(current_element)
|
return html? && current_element == HTML_SCRIPT
|
||||||
|
end
|
||||||
|
|
||||||
|
##
|
||||||
|
# @return [TrueClass|FalseClass]
|
||||||
|
#
|
||||||
|
def html_style?
|
||||||
|
return html? && current_element == HTML_STYLE
|
||||||
end
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
|
|
|
@ -176,7 +176,17 @@ xml_decl
|
||||||
# Plain text
|
# Plain text
|
||||||
|
|
||||||
text
|
text
|
||||||
= T_TEXT { on_text(val[0]) }
|
= T_TEXT text_follow
|
||||||
|
{
|
||||||
|
text = val[1] ? val[0] + val[1] : val[0]
|
||||||
|
|
||||||
|
on_text(text)
|
||||||
|
}
|
||||||
|
;
|
||||||
|
|
||||||
|
text_follow
|
||||||
|
= T_TEXT text_follow { val[1] ? val[0] + val[1] : val[0] }
|
||||||
|
| _ { nil }
|
||||||
;
|
;
|
||||||
|
|
||||||
# Strings
|
# Strings
|
||||||
|
|
|
@ -3,10 +3,24 @@ require 'spec_helper'
|
||||||
describe Oga::XML::Lexer do
|
describe Oga::XML::Lexer do
|
||||||
describe 'HTML script elements' do
|
describe 'HTML script elements' do
|
||||||
it 'treats the content of a script tag as plain text' do
|
it 'treats the content of a script tag as plain text' do
|
||||||
lex('<script>foo <bar</script>', :html => true).should == [
|
lex_html('<script>foo <bar</script>').should == [
|
||||||
[:T_ELEM_START, nil, 1],
|
[:T_ELEM_START, nil, 1],
|
||||||
[:T_ELEM_NAME, 'script', 1],
|
[:T_ELEM_NAME, 'script', 1],
|
||||||
[:T_TEXT, 'foo <bar', 1],
|
[:T_TEXT, 'foo ', 1],
|
||||||
|
[:T_TEXT, '<', 1],
|
||||||
|
[:T_TEXT, 'bar', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'treats style tags inside script tags as text' do
|
||||||
|
lex_html('<script><style></style></script>').should == [
|
||||||
|
[:T_ELEM_START, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'script', 1],
|
||||||
|
[:T_TEXT, '<', 1],
|
||||||
|
[:T_TEXT, 'style>', 1],
|
||||||
|
[:T_TEXT, '<', 1],
|
||||||
|
[:T_TEXT, '/style>', 1],
|
||||||
[:T_ELEM_END, nil, 1]
|
[:T_ELEM_END, nil, 1]
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
|
@ -3,7 +3,7 @@ require 'spec_helper'
|
||||||
describe Oga::XML::Lexer do
|
describe Oga::XML::Lexer do
|
||||||
describe 'HTML style elements' do
|
describe 'HTML style elements' do
|
||||||
it 'lexes an empty <style> tag' do
|
it 'lexes an empty <style> tag' do
|
||||||
lex('<style></style>', :html => true).should == [
|
lex_html('<style></style>').should == [
|
||||||
[:T_ELEM_START, nil, 1],
|
[:T_ELEM_START, nil, 1],
|
||||||
[:T_ELEM_NAME, 'style', 1],
|
[:T_ELEM_NAME, 'style', 1],
|
||||||
[:T_ELEM_END, nil, 1]
|
[:T_ELEM_END, nil, 1]
|
||||||
|
@ -11,16 +11,30 @@ describe Oga::XML::Lexer do
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'treats the content of a style tag as plain text' do
|
it 'treats the content of a style tag as plain text' do
|
||||||
lex('<style>foo <bar</style>', :html => true).should == [
|
lex_html('<style>foo <bar</style>').should == [
|
||||||
[:T_ELEM_START, nil, 1],
|
[:T_ELEM_START, nil, 1],
|
||||||
[:T_ELEM_NAME, 'style', 1],
|
[:T_ELEM_NAME, 'style', 1],
|
||||||
[:T_TEXT, 'foo <bar', 1],
|
[:T_TEXT, 'foo ', 1],
|
||||||
|
[:T_TEXT, '<', 1],
|
||||||
|
[:T_TEXT, 'bar', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'treats script tags inside style tags as text' do
|
||||||
|
lex_html('<style><script></script></style>').should == [
|
||||||
|
[:T_ELEM_START, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'style', 1],
|
||||||
|
[:T_TEXT, '<', 1],
|
||||||
|
[:T_TEXT, 'script>', 1],
|
||||||
|
[:T_TEXT, '<', 1],
|
||||||
|
[:T_TEXT, '/script>', 1],
|
||||||
[:T_ELEM_END, nil, 1]
|
[:T_ELEM_END, nil, 1]
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'lexes a multi-line <style> tag using a String as the input' do
|
it 'lexes a multi-line <style> tag using a String as the input' do
|
||||||
lex("<style>foo\nbar</style>", :html => true).should == [
|
lex_html("<style>foo\nbar</style>").should == [
|
||||||
[:T_ELEM_START, nil, 1],
|
[:T_ELEM_START, nil, 1],
|
||||||
[:T_ELEM_NAME, 'style', 1],
|
[:T_ELEM_NAME, 'style', 1],
|
||||||
[:T_TEXT, "foo\nbar", 1],
|
[:T_TEXT, "foo\nbar", 1],
|
||||||
|
@ -29,9 +43,7 @@ describe Oga::XML::Lexer do
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'lexes a multi-line <style> tag using an IO as the input' do
|
it 'lexes a multi-line <style> tag using an IO as the input' do
|
||||||
io = StringIO.new("<style>foo\nbar</style>")
|
lex_stringio("<style>foo\nbar</style>", :html => true).should == [
|
||||||
|
|
||||||
lex(io, :html => true).should == [
|
|
||||||
[:T_ELEM_START, nil, 1],
|
[:T_ELEM_START, nil, 1],
|
||||||
[:T_ELEM_NAME, 'style', 1],
|
[:T_ELEM_NAME, 'style', 1],
|
||||||
[:T_TEXT, "foo\n", 1],
|
[:T_TEXT, "foo\n", 1],
|
||||||
|
|
Loading…
Reference in New Issue