Lex contents of <script> tags as plain text.

When lexing input in HTML mode, the lexer has to treat _all_ content of a
<script> tag as plain text. This ensures that the lexer can correctly process
input such as "x <y" and "// <foo>".

Fixes #70.
This commit is contained in:
Yorick Peterse 2015-03-02 16:20:54 +01:00
parent 351b5ac004
commit ba2177e2cf
4 changed files with 73 additions and 2 deletions

View File

@ -17,7 +17,13 @@ on `ts` and `te`) so the macro ignores this argument.
liboga_xml_lexer_callback_simple(self, name); liboga_xml_lexer_callback_simple(self, name);
#define advance_line(amount) \ #define advance_line(amount) \
rb_funcall(self, rb_intern("advance_line"), 1, INT2NUM(amount)); rb_funcall(self, id_advance_line, 1, INT2NUM(amount));
/**
 * Evaluates to a non-zero value when the lexer's Ruby method
 * `inside_html_script?` returns `true`.
 *
 * Requires a variable called `self` to be in scope at the expansion site.
 * The replacement list is wrapped in parentheses so that it always behaves as
 * a single expression, regardless of the operators surrounding the macro call
 * (e.g. `!inside_html_script_p()`).
 */
#define inside_html_script_p() \
    (rb_funcall(self, id_inside_html_script_p, 0) == Qtrue)
/*
 * Method IDs used by the advance_line() and inside_html_script_p() macros.
 * They are assigned with rb_intern() in Init_liboga_xml_lexer().
 */
ID id_advance_line;
ID id_inside_html_script_p;
%%machine c_lexer; %%machine c_lexer;
@ -167,6 +173,9 @@ void Init_liboga_xml_lexer()
VALUE mXML = rb_const_get(mOga, rb_intern("XML")); VALUE mXML = rb_const_get(mOga, rb_intern("XML"));
VALUE cLexer = rb_define_class_under(mXML, "Lexer", rb_cObject); VALUE cLexer = rb_define_class_under(mXML, "Lexer", rb_cObject);
id_advance_line = rb_intern("advance_line");
id_inside_html_script_p = rb_intern("inside_html_script?");
rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1); rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1);
rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0); rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0);

View File

@ -185,6 +185,17 @@ public class Lexer extends RubyObject
this.callMethod(context, "advance_line", lines); this.callMethod(context, "advance_line", lines);
} }
/**
 * Asks the Ruby side of the lexer whether an HTML script tag is currently
 * being processed. This simply forwards to the Ruby method
 * Oga::XML::Lexer#inside_html_script?, see that method for the details.
 *
 * @return true when the Ruby method returned a truthy value.
 */
public Boolean inside_html_script_p()
{
    return this
        .callMethod(this.runtime.getCurrentContext(), "inside_html_script?")
        .isTrue();
}
} }
%%{ %%{

View File

@ -327,7 +327,17 @@
# We're done with the open tag of the element. # We're done with the open tag of the element.
'>' => { '>' => {
callback_simple(id_on_element_open_end); callback_simple(id_on_element_open_end);
if ( inside_html_script_p() )
{
mark = ts + 1;
fnext script_text;
}
else
{
fnext main; fnext main;
}
}; };
# Self closing tags. # Self closing tags.
@ -391,6 +401,30 @@
}; };
*|; *|;
# <script> tags in HTML can contain basically anything except for the
# literal "</script>". As a result of this we can't use the regular text
# machine.
#
# This scanner is entered from the '>' action of an element's open tag when
# inside_html_script_p() is true; that action sets `mark` to `ts + 1` before
# jumping here.
script_text := |*
# The closing tag ends the script body. The action passes the range from
# `mark` up to `ts` (the start of "</script>") to the on_text callback,
# resets `mark`, flushes any pending line count via advance_line(), fires the
# on_element_end callback and hands control back to the main machine.
#
# NOTE(review): the literal is matched exactly as written, so input such as
# "</SCRIPT>" or "</script >" does not appear to terminate this scanner -
# confirm whether that is intended.
'</script>' => {
callback(id_on_text, data, encoding, mark, ts);
mark = 0;
if ( lines > 0 )
{
advance_line(lines);
lines = 0;
}
callback_simple(id_on_element_end);
fnext main;
};
# Any other character is consumed as part of the script body. The
# count_newlines action (defined outside of this hunk) runs on every
# character; presumably it is what increments `lines` - TODO confirm.
any $count_newlines;
*|;
# The main machine aka the entry point of Ragel. # The main machine aka the entry point of Ragel.
main := |* main := |*
doctype_start => start_doctype; doctype_start => start_doctype;

View File

@ -40,6 +40,14 @@ module Oga
class Lexer class Lexer
attr_reader :html attr_reader :html
##
# Name of the element that contains JavaScript. The name of the current
# element is compared against this value to determine if a `<script>` tag
# is being processed.
#
# @return [String]
#
SCRIPT_TAG = 'script'.freeze
## ##
# @param [String|IO] data The data to lex. This can either be a String or # @param [String|IO] data The data to lex. This can either be a String or
# an IO instance. # an IO instance.
@ -181,6 +189,15 @@ module Oga
return @elements.last return @elements.last
end end
##
# Checks whether the lexer is in HTML mode and the element currently being
# processed is the `<script>` element.
#
# @return [TrueClass|FalseClass]
#
def inside_html_script?
  html? && current_element == SCRIPT_TAG
end
## ##
# Called when processing a single quote. # Called when processing a single quote.
# #