Handle parsing of HTML <style> tags.

This re-applies the technique already used for HTML <script> tags. While adding
this I also renamed/normalized a few things so that it's easier to support
additional tags in the future. One downside of this setup is that the following
will not be parsed correctly by Oga, because the lexer treats either literal
closing tag (</script> or </style>) as the end of the element's content:

    <style>
        </script>
    </style>

The same applies to <script> tags containing a literal </style> tag. Since this
particular case is rather unlikely to occur, I'm OK with not supporting it, as it
_does_ simplify the lexer quite a bit.

Fixes #80
This commit is contained in:
Yorick Peterse 2015-03-03 16:28:05 +01:00
parent 73534375d5
commit 78e40b55c0
8 changed files with 51 additions and 26 deletions

View File

@ -19,11 +19,11 @@ on `ts` and `te`) so the macro ignores this argument.
#define advance_line(amount) \ #define advance_line(amount) \
rb_funcall(self, id_advance_line, 1, INT2NUM(amount)); rb_funcall(self, id_advance_line, 1, INT2NUM(amount));
#define inside_html_script_p() \ #define literal_html_element_p() \
rb_funcall(self, id_inside_html_script_p, 0) == Qtrue rb_funcall(self, id_literal_html_element_p, 0) == Qtrue
ID id_advance_line; ID id_advance_line;
ID id_inside_html_script_p; ID id_literal_html_element_p;
%%machine c_lexer; %%machine c_lexer;
@ -173,8 +173,8 @@ void Init_liboga_xml_lexer()
VALUE mXML = rb_const_get(mOga, rb_intern("XML")); VALUE mXML = rb_const_get(mOga, rb_intern("XML"));
VALUE cLexer = rb_define_class_under(mXML, "Lexer", rb_cObject); VALUE cLexer = rb_define_class_under(mXML, "Lexer", rb_cObject);
id_advance_line = rb_intern("advance_line"); id_advance_line = rb_intern("advance_line");
id_inside_html_script_p = rb_intern("inside_html_script?"); id_literal_html_element_p = rb_intern("literal_html_element?");
rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1); rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1);
rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0); rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0);

View File

@ -187,14 +187,13 @@ public class Lexer extends RubyObject
} }
/** /**
* Returns true if we're in an HTML script tag. See * See * Oga::XML::Lexer#literal_html_element? for more information.
* Oga::XML::Lexer#inside_html_script? for more information.
*/ */
public Boolean inside_html_script_p() public Boolean literal_html_element_p()
{ {
ThreadContext context = this.runtime.getCurrentContext(); ThreadContext context = this.runtime.getCurrentContext();
return this.callMethod(context, "inside_html_script?").isTrue(); return this.callMethod(context, "literal_html_element?").isTrue();
} }
} }

View File

@ -328,11 +328,11 @@
'>' => { '>' => {
callback_simple(id_on_element_open_end); callback_simple(id_on_element_open_end);
if ( inside_html_script_p() ) if ( literal_html_element_p() )
{ {
mark = ts + 1; mark = ts + 1;
fnext script_text; fnext literal_html_element;
} }
else else
{ {
@ -401,11 +401,11 @@
}; };
*|; *|;
# <script> tags in HTML can contain basically anything except for the # Certain tags in HTML can contain basically anything except for the literal
# literal "</script>". As a result of this we can't use the regular text # closing tag. Two examples are script and style tags. As a result of this
# machine. # we can't use the regular text machine.
script_text := |* literal_html_element := |*
'</script>' => { '</script>' | '</style>' => {
callback(id_on_text, data, encoding, mark, ts); callback(id_on_text, data, encoding, mark, ts);
mark = 0; mark = 0;

View File

@ -41,12 +41,11 @@ module Oga
attr_reader :html attr_reader :html
## ##
# Element name used to determine if a tag being processed is a Javascript # Names of HTML tags of which the content should be lexed as-is.
# tag.
# #
# @return [String] # @return [Array]
# #
SCRIPT_TAG = 'script'.freeze LITERAL_HTML_ELEMENTS = %w{script style}
## ##
# @param [String|IO] data The data to lex. This can either be a String or # @param [String|IO] data The data to lex. This can either be a String or
@ -190,12 +189,12 @@ module Oga
end end
## ##
# Returns true if the current element is the HTML `<script>` element. # Returns true if the current element's content should be lexed as-is.
# #
# @return [TrueClass|FalseClass] # @return [TrueClass|FalseClass]
# #
def inside_html_script? def literal_html_element?
return html? && current_element == SCRIPT_TAG return html? && LITERAL_HTML_ELEMENTS.include?(current_element)
end end
## ##

View File

@ -12,8 +12,8 @@ module Oga
node = parent node = parent
root = root_node root = root_node
if root.is_a?(Document) and node.is_a?(Element) \ if root.is_a?(Document) and node.is_a?(Element) and root.html? \
and node.name == Lexer::SCRIPT_TAG and root.html? and Lexer::LITERAL_HTML_ELEMENTS.include?(node.name)
return super return super
else else
return Entities.encode(super) return Entities.encode(super)

View File

@ -2,7 +2,7 @@ require 'spec_helper'
describe Oga::XML::Lexer do describe Oga::XML::Lexer do
describe 'HTML script elements' do describe 'HTML script elements' do
it 'treats all contents of a script tag as plain text' do it 'treats the content of a script tag as plain text' do
lex('<script>foo <bar</script>', :html => true).should == [ lex('<script>foo <bar</script>', :html => true).should == [
[:T_ELEM_START, nil, 1], [:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'script', 1], [:T_ELEM_NAME, 'script', 1],

View File

@ -0,0 +1,14 @@
# Lexer spec covering HTML <style> elements: when lexing in HTML mode, the
# content of a <style> tag must be emitted as a single text token instead of
# being lexed as markup.
require 'spec_helper'
describe Oga::XML::Lexer do
describe 'HTML style elements' do
it 'treats the content of a style tag as plain text' do
# The input contains an unterminated "<bar"; with :html => true it is expected
# to come back verbatim as text rather than as the start of a new element.
# NOTE(review): `lex` is presumably a helper provided via spec_helper - confirm.
lex('<style>foo <bar</style>', :html => true).should == [
# NOTE(review): the trailing 1 in each token is presumably the line number -
# confirm against the lexer's token format.
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'style', 1],
# Everything between <style> and </style> arrives as one T_TEXT token.
[:T_TEXT, 'foo <bar', 1],
[:T_ELEM_END, nil, 1]
]
end
end
end

View File

@ -52,6 +52,19 @@ describe Oga::XML::Text do
text.to_xml.should == 'x > y' text.to_xml.should == 'x > y'
end end
end end
describe 'inside an HTML <style> element' do
it 'does not encode special characters as XML entities' do
document = Oga::XML::Document.new(:type => :html)
style = Oga::XML::Element.new(:name => 'style')
text = described_class.new(:text => 'x > y')
style.children << text
document.children << style
text.to_xml.should == 'x > y'
end
end
end end
describe '#inspect' do describe '#inspect' do