Handle parsing of HTML <style> tags.
This basically re-applies the technique used for HTML <script> tags. With this addition I decided to rename/normalize a few things so it's easier to add more tags in the future.

One downside of this setup is that the following will not be parsed correctly by Oga:

    <style>
    </script>
    </style>

The same applies to script tags containing a literal </style> tag. Since this particular case is rather unlikely to occur I'm OK with not supporting it, as it _does_ simplify the lexer quite a bit.

Fixes #80
This commit is contained in:
parent
73534375d5
commit
78e40b55c0
|
@ -19,11 +19,11 @@ on `ts` and `te`) so the macro ignores this argument.
|
||||||
#define advance_line(amount) \
|
#define advance_line(amount) \
|
||||||
rb_funcall(self, id_advance_line, 1, INT2NUM(amount));
|
rb_funcall(self, id_advance_line, 1, INT2NUM(amount));
|
||||||
|
|
||||||
#define inside_html_script_p() \
|
#define literal_html_element_p() \
|
||||||
rb_funcall(self, id_inside_html_script_p, 0) == Qtrue
|
rb_funcall(self, id_literal_html_element_p, 0) == Qtrue
|
||||||
|
|
||||||
ID id_advance_line;
|
ID id_advance_line;
|
||||||
ID id_inside_html_script_p;
|
ID id_literal_html_element_p;
|
||||||
|
|
||||||
%%machine c_lexer;
|
%%machine c_lexer;
|
||||||
|
|
||||||
|
@ -173,8 +173,8 @@ void Init_liboga_xml_lexer()
|
||||||
VALUE mXML = rb_const_get(mOga, rb_intern("XML"));
|
VALUE mXML = rb_const_get(mOga, rb_intern("XML"));
|
||||||
VALUE cLexer = rb_define_class_under(mXML, "Lexer", rb_cObject);
|
VALUE cLexer = rb_define_class_under(mXML, "Lexer", rb_cObject);
|
||||||
|
|
||||||
id_advance_line = rb_intern("advance_line");
|
id_advance_line = rb_intern("advance_line");
|
||||||
id_inside_html_script_p = rb_intern("inside_html_script?");
|
id_literal_html_element_p = rb_intern("literal_html_element?");
|
||||||
|
|
||||||
rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1);
|
rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1);
|
||||||
rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0);
|
rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0);
|
||||||
|
|
|
@ -187,14 +187,13 @@ public class Lexer extends RubyObject
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns true if we're in an HTML script tag. See
|
* See Oga::XML::Lexer#literal_html_element? for more information.
|
||||||
* Oga::XML::Lexer#inside_html_script? for more information.
|
|
||||||
*/
|
*/
|
||||||
public Boolean inside_html_script_p()
|
public Boolean literal_html_element_p()
|
||||||
{
|
{
|
||||||
ThreadContext context = this.runtime.getCurrentContext();
|
ThreadContext context = this.runtime.getCurrentContext();
|
||||||
|
|
||||||
return this.callMethod(context, "inside_html_script?").isTrue();
|
return this.callMethod(context, "literal_html_element?").isTrue();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -328,11 +328,11 @@
|
||||||
'>' => {
|
'>' => {
|
||||||
callback_simple(id_on_element_open_end);
|
callback_simple(id_on_element_open_end);
|
||||||
|
|
||||||
if ( inside_html_script_p() )
|
if ( literal_html_element_p() )
|
||||||
{
|
{
|
||||||
mark = ts + 1;
|
mark = ts + 1;
|
||||||
|
|
||||||
fnext script_text;
|
fnext literal_html_element;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -401,11 +401,11 @@
|
||||||
};
|
};
|
||||||
*|;
|
*|;
|
||||||
|
|
||||||
# <script> tags in HTML can contain basically anything except for the
|
# Certain tags in HTML can contain basically anything except for the literal
|
||||||
# literal "</script>". As a result of this we can't use the regular text
|
# closing tag. Two examples are script and style tags. As a result of this
|
||||||
# machine.
|
# we can't use the regular text machine.
|
||||||
script_text := |*
|
literal_html_element := |*
|
||||||
'</script>' => {
|
'</script>' | '</style>' => {
|
||||||
callback(id_on_text, data, encoding, mark, ts);
|
callback(id_on_text, data, encoding, mark, ts);
|
||||||
|
|
||||||
mark = 0;
|
mark = 0;
|
||||||
|
|
|
@ -41,12 +41,11 @@ module Oga
|
||||||
attr_reader :html
|
attr_reader :html
|
||||||
|
|
||||||
##
|
##
|
||||||
# Element name used to determine if a tag being processed is a Javascript
|
# Names of HTML tags whose content should be lexed as-is.
|
||||||
# tag.
|
|
||||||
#
|
#
|
||||||
# @return [String]
|
# @return [Array]
|
||||||
#
|
#
|
||||||
SCRIPT_TAG = 'script'.freeze
|
LITERAL_HTML_ELEMENTS = %w{script style}
|
||||||
|
|
||||||
##
|
##
|
||||||
# @param [String|IO] data The data to lex. This can either be a String or
|
# @param [String|IO] data The data to lex. This can either be a String or
|
||||||
|
@ -190,12 +189,12 @@ module Oga
|
||||||
end
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
# Returns true if the current element is the HTML `<script>` element.
|
# Returns true if the current element's content should be lexed as-is.
|
||||||
#
|
#
|
||||||
# @return [TrueClass|FalseClass]
|
# @return [TrueClass|FalseClass]
|
||||||
#
|
#
|
||||||
def inside_html_script?
|
def literal_html_element?
|
||||||
return html? && current_element == SCRIPT_TAG
|
return html? && LITERAL_HTML_ELEMENTS.include?(current_element)
|
||||||
end
|
end
|
||||||
|
|
||||||
##
|
##
|
||||||
|
|
|
@ -12,8 +12,8 @@ module Oga
|
||||||
node = parent
|
node = parent
|
||||||
root = root_node
|
root = root_node
|
||||||
|
|
||||||
if root.is_a?(Document) and node.is_a?(Element) \
|
if root.is_a?(Document) and node.is_a?(Element) and root.html? \
|
||||||
and node.name == Lexer::SCRIPT_TAG and root.html?
|
and Lexer::LITERAL_HTML_ELEMENTS.include?(node.name)
|
||||||
return super
|
return super
|
||||||
else
|
else
|
||||||
return Entities.encode(super)
|
return Entities.encode(super)
|
||||||
|
|
|
@ -2,7 +2,7 @@ require 'spec_helper'
|
||||||
|
|
||||||
describe Oga::XML::Lexer do
|
describe Oga::XML::Lexer do
|
||||||
describe 'HTML script elements' do
|
describe 'HTML script elements' do
|
||||||
it 'treats all contents of a script tag as plain text' do
|
it 'treats the content of a script tag as plain text' do
|
||||||
lex('<script>foo <bar</script>', :html => true).should == [
|
lex('<script>foo <bar</script>', :html => true).should == [
|
||||||
[:T_ELEM_START, nil, 1],
|
[:T_ELEM_START, nil, 1],
|
||||||
[:T_ELEM_NAME, 'script', 1],
|
[:T_ELEM_NAME, 'script', 1],
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
describe Oga::XML::Lexer do
|
||||||
|
describe 'HTML style elements' do
|
||||||
|
it 'treats the content of a style tag as plain text' do
|
||||||
|
lex('<style>foo <bar</style>', :html => true).should == [
|
||||||
|
[:T_ELEM_START, nil, 1],
|
||||||
|
[:T_ELEM_NAME, 'style', 1],
|
||||||
|
[:T_TEXT, 'foo <bar', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -52,6 +52,19 @@ describe Oga::XML::Text do
|
||||||
text.to_xml.should == 'x > y'
|
text.to_xml.should == 'x > y'
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
describe 'inside an HTML <style> element' do
|
||||||
|
it 'does not encode special characters as XML entities' do
|
||||||
|
document = Oga::XML::Document.new(:type => :html)
|
||||||
|
style = Oga::XML::Element.new(:name => 'style')
|
||||||
|
text = described_class.new(:text => 'x > y')
|
||||||
|
|
||||||
|
style.children << text
|
||||||
|
document.children << style
|
||||||
|
|
||||||
|
text.to_xml.should == 'x > y'
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
describe '#inspect' do
|
describe '#inspect' do
|
||||||
|
|
Loading…
Reference in New Issue