From 73fbbfbdbdecafcf5f873b8a27e81c19a2e2ed0c Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Thu, 16 Apr 2015 01:45:39 +0200 Subject: [PATCH] Use separate Ragel machines for script/style tags Previously a single Ragel machine was used for processing HTML script and style tags. This had the unfortunate side-effect that the following was not parsed correctly (while being valid HTML): `<script><style></style></script>` The same applied to style tags: `<style><script></script></style>` By using separate machines we can work around the above issue. The downside is that this can produce multiple T_TEXT nodes, which have to be stitched back together in the parser. --- ext/c/lexer.rl | 17 ++++--- ext/java/org/liboga/xml/Lexer.rl | 16 +++++-- ext/ragel/base_lexer.rl | 62 +++++++++++++------------- lib/oga/xml/lexer.rb | 21 ++++++--- lib/oga/xml/parser.rll | 12 ++++- spec/oga/xml/lexer/html_script_spec.rb | 18 +++++++- spec/oga/xml/lexer/html_style_spec.rb | 26 ++++++++--- 7 files changed, 118 insertions(+), 54 deletions(-) diff --git a/ext/c/lexer.rl b/ext/c/lexer.rl index c1bff37..c122e9f 100644 --- a/ext/c/lexer.rl +++ b/ext/c/lexer.rl @@ -19,11 +19,15 @@ on `ts` and `te`) so the macro ignores this argument. 
#define advance_line(amount) \ rb_funcall(self, id_advance_line, 1, INT2NUM(amount)); -#define literal_html_element_p() \ - rb_funcall(self, id_literal_html_element_p, 0) == Qtrue +#define html_script_p() \ + rb_funcall(self, id_html_script_p, 0) == Qtrue + +#define html_style_p() \ + rb_funcall(self, id_html_style_p, 0) == Qtrue ID id_advance_line; -ID id_literal_html_element_p; +ID id_html_script_p; +ID id_html_style_p; ID id_html; %%machine c_lexer; @@ -183,9 +187,10 @@ void Init_liboga_xml_lexer() VALUE mXML = rb_const_get(mOga, rb_intern("XML")); VALUE cLexer = rb_define_class_under(mXML, "Lexer", rb_cObject); - id_advance_line = rb_intern("advance_line"); - id_literal_html_element_p = rb_intern("literal_html_element?"); - id_html = rb_intern("html"); + id_advance_line = rb_intern("advance_line"); + id_html_script_p = rb_intern("html_script?"); + id_html_style_p = rb_intern("html_style?"); + id_html = rb_intern("html"); rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1); rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0); diff --git a/ext/java/org/liboga/xml/Lexer.rl b/ext/java/org/liboga/xml/Lexer.rl index 28ada04..458f9c9 100644 --- a/ext/java/org/liboga/xml/Lexer.rl +++ b/ext/java/org/liboga/xml/Lexer.rl @@ -194,13 +194,23 @@ public class Lexer extends RubyObject } /** - * See * Oga::XML::Lexer#literal_html_element? for more information. + * @see Oga::XML::Lexer#html_script? */ - public Boolean literal_html_element_p() + public Boolean html_script_p() { ThreadContext context = this.runtime.getCurrentContext(); - return this.callMethod(context, "literal_html_element?").isTrue(); + return this.callMethod(context, "html_script?").isTrue(); + } + + /** + * @see Oga::XML::Lexer#html_style? 
+ */ + public Boolean html_style_p() + { + ThreadContext context = this.runtime.getCurrentContext(); + + return this.callMethod(context, "html_style?").isTrue(); } } diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index ba67520..27a6cb0 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -58,7 +58,7 @@ } action advance_newline { - advance_line(1) + advance_line(1); } action hold_and_return { @@ -376,6 +376,12 @@ callback_simple(id_on_element_end); } + action close_element_fnext_main { + callback_simple(id_on_element_end); + + fnext main; + } + # Machine used for lexing the name/namespace of an element. element_name := |* identifier ':' => { @@ -465,9 +471,13 @@ '>' => { callback_simple(id_on_element_open_end); - if ( literal_html_element_p() ) + if ( html_script_p() ) { - fnext literal_html_element; + fnext html_script; + } + else if ( html_style_p() ) + { + fnext html_style; } else { @@ -506,6 +516,17 @@ terminate_text = ' 0 ) + { + advance_line(lines); + + lines = 0; + } + } + text := |* terminate_text | allowed_text => { callback(id_on_text, data, encoding, ts, te); @@ -541,36 +562,17 @@ # Certain tags in HTML can contain basically anything except for the literal # closing tag. Two examples are script and style tags. As a result of this # we can't use the regular text machine. 
- literal_html_closing_tags = '' | ''; - literal_html_allowed = (any* -- literal_html_closing_tags) $count_newlines; - literal_html_element := |* - literal_html_allowed => { - callback(id_on_text, data, encoding, ts, te); + literal_html_allowed = (^'<'+ | '<'+) $count_newlines; - if ( lines > 0 ) - { - advance_line(lines); + html_script := |* + literal_html_allowed => emit_text; + '' => close_element_fnext_main; + *|; - lines = 0; - } - }; - - literal_html_allowed %{ mark = p; } literal_html_closing_tags => { - callback(id_on_text, data, encoding, ts, mark); - - p = mark - 1; - mark = 0; - - if ( lines > 0 ) - { - advance_line(lines); - - lines = 0; - } - - fnext main; - }; + html_style := |* + literal_html_allowed => emit_text; + '' => close_element_fnext_main; *|; # The main machine aka the entry point of Ragel. diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb index e9ea474..4b05b29 100644 --- a/lib/oga/xml/lexer.rb +++ b/lib/oga/xml/lexer.rb @@ -40,12 +40,18 @@ module Oga class Lexer attr_reader :html + # @return [String] + HTML_SCRIPT = 'script' + + # @return [String] + HTML_STYLE = 'style' + ## # Names of HTML tags of which the content should be lexed as-is. # # @return [Array] # - LITERAL_HTML_ELEMENTS = %w{script style} + LITERAL_HTML_ELEMENTS = [HTML_SCRIPT, HTML_STYLE] ## # @param [String|IO] data The data to lex. This can either be a String or @@ -189,12 +195,17 @@ module Oga end ## - # Returns true if the current element's content should be lexed as-is. - # # @return [TrueClass|FalseClass] # - def literal_html_element? - return html? && LITERAL_HTML_ELEMENTS.include?(current_element) + def html_script? + return html? && current_element == HTML_SCRIPT + end + + ## + # @return [TrueClass|FalseClass] + # + def html_style? + return html? 
&& current_element == HTML_STYLE end ## diff --git a/lib/oga/xml/parser.rll b/lib/oga/xml/parser.rll index 4657762..3343ebd 100644 --- a/lib/oga/xml/parser.rll +++ b/lib/oga/xml/parser.rll @@ -176,7 +176,17 @@ xml_decl # Plain text text - = T_TEXT { on_text(val[0]) } + = T_TEXT text_follow + { + text = val[1] ? val[0] + val[1] : val[0] + + on_text(text) + } + ; + +text_follow + = T_TEXT text_follow { val[1] ? val[0] + val[1] : val[0] } + | _ { nil } ; # Strings diff --git a/spec/oga/xml/lexer/html_script_spec.rb b/spec/oga/xml/lexer/html_script_spec.rb index 539dc10..c958e54 100644 --- a/spec/oga/xml/lexer/html_script_spec.rb +++ b/spec/oga/xml/lexer/html_script_spec.rb @@ -3,10 +3,24 @@ require 'spec_helper' describe Oga::XML::Lexer do describe 'HTML script elements' do it 'treats the content of a script tag as plain text' do - lex('', :html => true).should == [ + lex_html('').should == [ [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'script', 1], - [:T_TEXT, 'foo ').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'script', 1], + [:T_TEXT, '<', 1], + [:T_TEXT, 'style>', 1], + [:T_TEXT, '<', 1], + [:T_TEXT, '/style>', 1], [:T_ELEM_END, nil, 1] ] end diff --git a/spec/oga/xml/lexer/html_style_spec.rb b/spec/oga/xml/lexer/html_style_spec.rb index 6ac7353..310ee03 100644 --- a/spec/oga/xml/lexer/html_style_spec.rb +++ b/spec/oga/xml/lexer/html_style_spec.rb @@ -3,7 +3,7 @@ require 'spec_helper' describe Oga::XML::Lexer do describe 'HTML style elements' do it 'lexes an empty ', :html => true).should == [ + lex_html('').should == [ [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'style', 1], [:T_ELEM_END, nil, 1] @@ -11,16 +11,30 @@ describe Oga::XML::Lexer do end it 'treats the content of a style tag as plain text' do - lex('', :html => true).should == [ + lex_html('').should == [ [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'style', 1], - [:T_TEXT, 'foo ').should == [ + [:T_ELEM_START, nil, 1], + [:T_ELEM_NAME, 'style', 1], + [:T_TEXT, '<', 1], + [:T_TEXT, 'script>', 1], + 
[:T_TEXT, '<', 1], + [:T_TEXT, '/script>', 1], [:T_ELEM_END, nil, 1] ] end it 'lexes a multi-line ", :html => true).should == [ + lex_html("").should == [ [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'style', 1], [:T_TEXT, "foo\nbar", 1], @@ -29,9 +43,7 @@ describe Oga::XML::Lexer do end it 'lexes a multi-line ") - - lex(io, :html => true).should == [ + lex_stringio("", :html => true).should == [ [:T_ELEM_START, nil, 1], [:T_ELEM_NAME, 'style', 1], [:T_TEXT, "foo\n", 1],