diff --git a/ext/c/lexer.rl b/ext/c/lexer.rl index 0defe64..73039dd 100644 --- a/ext/c/lexer.rl +++ b/ext/c/lexer.rl @@ -17,7 +17,13 @@ on `ts` and `te`) so the macro ignores this argument. liboga_xml_lexer_callback_simple(self, name); #define advance_line(amount) \ - rb_funcall(self, rb_intern("advance_line"), 1, INT2NUM(amount)); + rb_funcall(self, id_advance_line, 1, INT2NUM(amount)); + +#define inside_html_script_p() \ + (rb_funcall(self, id_inside_html_script_p, 0) == Qtrue) + +ID id_advance_line; +ID id_inside_html_script_p; %%machine c_lexer; @@ -167,6 +173,9 @@ void Init_liboga_xml_lexer() VALUE mXML = rb_const_get(mOga, rb_intern("XML")); VALUE cLexer = rb_define_class_under(mXML, "Lexer", rb_cObject); + id_advance_line = rb_intern("advance_line"); + id_inside_html_script_p = rb_intern("inside_html_script?"); + rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1); rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0); diff --git a/ext/java/org/liboga/xml/Lexer.rl b/ext/java/org/liboga/xml/Lexer.rl index d746dd2..a359234 100644 --- a/ext/java/org/liboga/xml/Lexer.rl +++ b/ext/java/org/liboga/xml/Lexer.rl @@ -185,6 +185,17 @@ public class Lexer extends RubyObject this.callMethod(context, "advance_line", lines); } + + /** + * Returns true if we're in an HTML script tag. See + * Oga::XML::Lexer#inside_html_script? for more information. + */ + public Boolean inside_html_script_p() + { + ThreadContext context = this.runtime.getCurrentContext(); + + return this.callMethod(context, "inside_html_script?").isTrue(); + } } %%{ diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index 305f9e2..9b107d9 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -327,7 +327,17 @@ # We're done with the open tag of the element. '>' => { callback_simple(id_on_element_open_end); - fnext main; + + if ( inside_html_script_p() ) + { + mark = ts + 1; + + fnext script_text; + } + else + { + fnext main; + } }; # Self closing tags. 
@@ -391,6 +401,30 @@ }; *|; + # <script> tags in HTML can contain basically anything except for the + # literal "</script>". As a result of this we can't use the regular text + # machine. + script_text := |* + '</script>' => { + callback(id_on_text, data, encoding, mark, ts); + + mark = 0; + + if ( lines > 0 ) + { + advance_line(lines); + + lines = 0; + } + + callback_simple(id_on_element_end); + + fnext main; + }; + + any $count_newlines; + *|; + # The main machine aka the entry point of Ragel. main := |* doctype_start => start_doctype; diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb index fa1757c..14f0784 100644 --- a/lib/oga/xml/lexer.rb +++ b/lib/oga/xml/lexer.rb @@ -40,6 +40,14 @@ module Oga class Lexer attr_reader :html + ## + # Element name used to determine if a tag being processed is a Javascript + # tag. + # + # @return [String] + # + SCRIPT_TAG = 'script'.freeze + ## # @param [String|IO] data The data to lex. This can either be a String or # an IO instance. @@ -181,6 +189,15 @@ module Oga return @elements.last end + ## + # Returns true if the current element is the HTML `