Handle parsing of HTML <style> tags.

This basically re-applies the technique used for HTML <script> tags. With this addition I decided to rename/normalize a few things so that it's easier to add more tags in the future. One downside of this setup is that the following will not be parsed correctly by Oga: "<style> </script> </style>". The same applies to script tags containing a literal </style> tag. Since this particular case is rather unlikely to occur, I'm OK with not supporting it, as it _does_ simplify the lexer quite a bit. Fixes #80.
2015-03-03 16:28:05 +01:00 · 2015-03-03 16:28:05 +01:00 · 78e40b55c0
parent 73534375d5
commit 78e40b55c0
8 changed files with 51 additions and 26 deletions
--- a/ext/c/lexer.rl
+++ b/ext/c/lexer.rl
@ -19,11 +19,11 @@ on `ts` and `te`) so the macro ignores this argument.
 #define advance_line(amount) \
    rb_funcall(self, id_advance_line, 1, INT2NUM(amount));

-#define inside_html_script_p() \
-    rb_funcall(self, id_inside_html_script_p, 0) == Qtrue
+#define literal_html_element_p() \
+    rb_funcall(self, id_literal_html_element_p, 0) == Qtrue

 ID id_advance_line;
-ID id_inside_html_script_p;
+ID id_literal_html_element_p;

 %%machine c_lexer;

@ -173,8 +173,8 @@ void Init_liboga_xml_lexer()
    VALUE mXML   = rb_const_get(mOga, rb_intern("XML"));
    VALUE cLexer = rb_define_class_under(mXML, "Lexer", rb_cObject);

-    id_advance_line         = rb_intern("advance_line");
-    id_inside_html_script_p = rb_intern("inside_html_script?");
+    id_advance_line           = rb_intern("advance_line");
+    id_literal_html_element_p = rb_intern("literal_html_element?");

    rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1);
    rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0);
--- a/ext/java/org/liboga/xml/Lexer.rl
+++ b/ext/java/org/liboga/xml/Lexer.rl
@ -187,14 +187,13 @@ public class Lexer extends RubyObject
    }

    /**
-     * Returns true if we're in an HTML script tag. See
-     * Oga::XML::Lexer#inside_html_script? for more information.
+     * See Oga::XML::Lexer#literal_html_element? for more information.
     */
-    public Boolean inside_html_script_p()
+    public Boolean literal_html_element_p()
    {
        ThreadContext context = this.runtime.getCurrentContext();

-        return this.callMethod(context, "inside_html_script?").isTrue();
+        return this.callMethod(context, "literal_html_element?").isTrue();
    }
 }

--- a/ext/ragel/base_lexer.rl
+++ b/ext/ragel/base_lexer.rl
@ -328,11 +328,11 @@
        '>' => {
            callback_simple(id_on_element_open_end);

-            if ( inside_html_script_p() )
+            if ( literal_html_element_p() )
            {
                mark = ts + 1;

-                fnext script_text;
+                fnext literal_html_element;
            }
            else
            {
@ -401,11 +401,11 @@
        };
    *|;

-    # <script> tags in HTML can contain basically anything except for the
-    # literal "</script>". As a result of this we can't use the regular text
-    # machine.
-    script_text := |*
-        '</script>' => {
+    # Certain tags in HTML can contain basically anything except for the literal
+    # closing tag. Two examples are script and style tags.  As a result of this
+    # we can't use the regular text machine.
+    literal_html_element := |*
+        '</script>' | '</style>' => {
            callback(id_on_text, data, encoding, mark, ts);

            mark = 0;
--- a/lib/oga/xml/lexer.rb
+++ b/lib/oga/xml/lexer.rb
@ -41,12 +41,11 @@ module Oga
      attr_reader :html

      ##
-      # Element name used to determine if a tag being processed is a Javascript
-      # tag.
+      # Names of HTML tags of which the content should be lexed as-is.
      #
-      # @return [String]
+      # @return [Array]
      #
-      SCRIPT_TAG = 'script'.freeze
+      LITERAL_HTML_ELEMENTS = %w{script style}

      ##
      # @param [String|IO] data The data to lex. This can either be a String or
@ -190,12 +189,12 @@ module Oga
      end

      ##
-      # Returns true if the current element is the HTML `<script>` element.
+      # Returns true if the current element's content should be lexed as-is.
      #
      # @return [TrueClass|FalseClass]
      #
-      def inside_html_script?
-        return html? && current_element == SCRIPT_TAG
+      def literal_html_element?
+        return html? && LITERAL_HTML_ELEMENTS.include?(current_element)
      end

      ##
--- a/lib/oga/xml/text.rb
+++ b/lib/oga/xml/text.rb
@ -12,8 +12,8 @@ module Oga
        node = parent
        root = root_node

-        if root.is_a?(Document) and node.is_a?(Element) \
-        and node.name == Lexer::SCRIPT_TAG and root.html?
+        if root.is_a?(Document) and node.is_a?(Element) and root.html? \
+        and Lexer::LITERAL_HTML_ELEMENTS.include?(node.name)
          return super
        else
          return Entities.encode(super)
--- a/spec/oga/xml/lexer/html_script_spec.rb
+++ b/spec/oga/xml/lexer/html_script_spec.rb
@ -2,7 +2,7 @@ require 'spec_helper'

 describe Oga::XML::Lexer do
  describe 'HTML script elements' do
-    it 'treats all contents of a script tag as plain text' do
+    it 'treats the content of a script tag as plain text' do
      lex('<script>foo <bar</script>', :html => true).should == [
        [:T_ELEM_START, nil, 1],
        [:T_ELEM_NAME, 'script', 1],
--- a/spec/oga/xml/lexer/html_style_spec.rb
+++ b/spec/oga/xml/lexer/html_style_spec.rb
@ -0,0 +1,14 @@
+require 'spec_helper'
+
+describe Oga::XML::Lexer do
+  describe 'HTML style elements' do
+    it 'treats the content of a style tag as plain text' do
+      lex('<style>foo <bar</style>', :html => true).should == [
+        [:T_ELEM_START, nil, 1],
+        [:T_ELEM_NAME, 'style', 1],
+        [:T_TEXT, 'foo <bar', 1],
+        [:T_ELEM_END, nil, 1]
+      ]
+    end
+  end
+end
--- a/spec/oga/xml/text_spec.rb
+++ b/spec/oga/xml/text_spec.rb
@ -52,6 +52,19 @@ describe Oga::XML::Text do
        text.to_xml.should == 'x > y'
      end
    end
+
+    describe 'inside an HTML <style> element' do
+      it 'does not encode special characters as XML entities' do
+        document = Oga::XML::Document.new(:type => :html)
+        style    = Oga::XML::Element.new(:name => 'style')
+        text     = described_class.new(:text => 'x > y')
+
+        style.children    << text
+        document.children << style
+
+        text.to_xml.should == 'x > y'
+      end
+    end
  end

  describe '#inspect' do