diff --git a/ext/c/lexer.rl b/ext/c/lexer.rl
index c1bff37..c122e9f 100644
--- a/ext/c/lexer.rl
+++ b/ext/c/lexer.rl
@@ -19,11 +19,15 @@ on `ts` and `te`) so the macro ignores this argument.
#define advance_line(amount) \
rb_funcall(self, id_advance_line, 1, INT2NUM(amount));
-#define literal_html_element_p() \
- rb_funcall(self, id_literal_html_element_p, 0) == Qtrue
+/* Evaluates to non-zero when the Ruby method behind id_html_script_p returns
+ * true for `self`. The replacement list is parenthesized so the macro behaves
+ * as a single operand inside larger expressions (e.g. `!html_script_p()`). */
+#define html_script_p() \
+ (rb_funcall(self, id_html_script_p, 0) == Qtrue)
+
+/* Evaluates to non-zero when the Ruby method behind id_html_style_p returns
+ * true for `self`. The replacement list is parenthesized so the macro behaves
+ * as a single operand inside larger expressions (e.g. `!html_style_p()`). */
+#define html_style_p() \
+ (rb_funcall(self, id_html_style_p, 0) == Qtrue)
ID id_advance_line;
-ID id_literal_html_element_p;
+ID id_html_script_p;
+ID id_html_style_p;
ID id_html;
%%machine c_lexer;
@@ -183,9 +187,10 @@ void Init_liboga_xml_lexer()
VALUE mXML = rb_const_get(mOga, rb_intern("XML"));
VALUE cLexer = rb_define_class_under(mXML, "Lexer", rb_cObject);
- id_advance_line = rb_intern("advance_line");
- id_literal_html_element_p = rb_intern("literal_html_element?");
- id_html = rb_intern("html");
+ id_advance_line = rb_intern("advance_line");
+ id_html_script_p = rb_intern("html_script?");
+ id_html_style_p = rb_intern("html_style?");
+ id_html = rb_intern("html");
rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1);
rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0);
diff --git a/ext/java/org/liboga/xml/Lexer.rl b/ext/java/org/liboga/xml/Lexer.rl
index 28ada04..458f9c9 100644
--- a/ext/java/org/liboga/xml/Lexer.rl
+++ b/ext/java/org/liboga/xml/Lexer.rl
@@ -194,13 +194,23 @@ public class Lexer extends RubyObject
}
/**
- * See * Oga::XML::Lexer#literal_html_element? for more information.
+ * @see Oga::XML::Lexer#html_script?
*/
- public Boolean literal_html_element_p()
+ public Boolean html_script_p()
{
ThreadContext context = this.runtime.getCurrentContext();
- return this.callMethod(context, "literal_html_element?").isTrue();
+ return this.callMethod(context, "html_script?").isTrue();
+ }
+
+ /** Calls the Ruby method "html_style?" on this object and returns isTrue() of its result.
+ * @see Oga::XML::Lexer#html_style?
+ */
+ public Boolean html_style_p()
+ {
+ ThreadContext context = this.runtime.getCurrentContext();
+
+ return this.callMethod(context, "html_style?").isTrue();
}
}
diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl
index ba67520..27a6cb0 100644
--- a/ext/ragel/base_lexer.rl
+++ b/ext/ragel/base_lexer.rl
@@ -58,7 +58,7 @@
}
action advance_newline {
- advance_line(1)
+ advance_line(1);
}
action hold_and_return {
@@ -376,6 +376,12 @@
callback_simple(id_on_element_end);
}
+ # Fires the element-end callback, then selects `main` as the next machine.
+ action close_element_fnext_main {
+ callback_simple(id_on_element_end);
+
+ fnext main;
+ }
+
# Machine used for lexing the name/namespace of an element.
element_name := |*
identifier ':' => {
@@ -465,9 +471,13 @@
'>' => {
callback_simple(id_on_element_open_end);
- if ( literal_html_element_p() )
+ if ( html_script_p() )
{
- fnext literal_html_element;
+ fnext html_script;
+ }
+ else if ( html_style_p() )
+ {
+ fnext html_style;
}
else
{
@@ -506,6 +516,17 @@
terminate_text = '' | ' 0 )
+ {
+ advance_line(lines);
+
+ lines = 0;
+ }
+ }
+
text := |*
terminate_text | allowed_text => {
callback(id_on_text, data, encoding, ts, te);
@@ -541,36 +562,17 @@
# Certain tags in HTML can contain basically anything except for the literal
# closing tag. Two examples are script and style tags. As a result of this
# we can't use the regular text machine.
- literal_html_closing_tags = '</script>' | '</style>';
- literal_html_allowed = (any* -- literal_html_closing_tags) $count_newlines;
- literal_html_element := |*
- literal_html_allowed => {
- callback(id_on_text, data, encoding, ts, te);
+ literal_html_allowed = (^'<'+ | '<'+) $count_newlines;
- if ( lines > 0 )
- {
- advance_line(lines);
+ # Scanner used for the body of an HTML script element. Runs of non-'<'
+ # characters and runs of '<' are handed to emit_text; only the literal
+ # closing tag ends the element and hands control back to `main`.
+ html_script := |*
+ literal_html_allowed => emit_text;
+ '</script>' => close_element_fnext_main;
+ *|;
- lines = 0;
- }
- };
-
- literal_html_allowed %{ mark = p; } literal_html_closing_tags => {
- callback(id_on_text, data, encoding, ts, mark);
-
- p = mark - 1;
- mark = 0;
-
- if ( lines > 0 )
- {
- advance_line(lines);
-
- lines = 0;
- }
-
- fnext main;
- };
+ # Scanner used for the body of an HTML style element. Runs of non-'<'
+ # characters and runs of '<' are handed to emit_text; only the literal
+ # closing tag ends the element and hands control back to `main`.
+ html_style := |*
+ literal_html_allowed => emit_text;
+ '</style>' => close_element_fnext_main;
*|;
# The main machine aka the entry point of Ragel.
diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb
index e9ea474..4b05b29 100644
--- a/lib/oga/xml/lexer.rb
+++ b/lib/oga/xml/lexer.rb
@@ -40,12 +40,18 @@ module Oga
class Lexer
attr_reader :html
+ # @return [String]
+ HTML_SCRIPT = 'script'
+
+ # @return [String]
+ HTML_STYLE = 'style'
+
##
# Names of HTML tags of which the content should be lexed as-is.
#
# @return [Array]
#
- LITERAL_HTML_ELEMENTS = %w{script style}
+ LITERAL_HTML_ELEMENTS = [HTML_SCRIPT, HTML_STYLE]
##
# @param [String|IO] data The data to lex. This can either be a String or
@@ -189,12 +195,17 @@ module Oga
end
##
- # Returns true if the current element's content should be lexed as-is.
- #
# @return [TrueClass|FalseClass]
#
- def literal_html_element?
- return html? && LITERAL_HTML_ELEMENTS.include?(current_element)
+ def html_script?
+ return html? && current_element == HTML_SCRIPT
+ end
+
+ ##
+ # @return [TrueClass|FalseClass] true when `html?` holds and `current_element` equals HTML_STYLE
+ #
+ def html_style?
+ return html? && current_element == HTML_STYLE
end
##
diff --git a/lib/oga/xml/parser.rll b/lib/oga/xml/parser.rll
index 4657762..3343ebd 100644
--- a/lib/oga/xml/parser.rll
+++ b/lib/oga/xml/parser.rll
@@ -176,7 +176,17 @@ xml_decl
# Plain text
text
- = T_TEXT { on_text(val[0]) }
+ = T_TEXT text_follow
+ {
+ text = val[1] ? val[0] + val[1] : val[0]
+
+ on_text(text)
+ }
+ ;
+
+text_follow
+ = T_TEXT text_follow { val[1] ? val[0] + val[1] : val[0] }
+ | _ { nil }
;
# Strings
diff --git a/spec/oga/xml/lexer/html_script_spec.rb b/spec/oga/xml/lexer/html_script_spec.rb
index 539dc10..c958e54 100644
--- a/spec/oga/xml/lexer/html_script_spec.rb
+++ b/spec/oga/xml/lexer/html_script_spec.rb
@@ -3,10 +3,24 @@ require 'spec_helper'
describe Oga::XML::Lexer do
describe 'HTML script elements' do
it 'treats the content of a script tag as plain text' do
- lex('', :html => true).should == [
+ lex_html('').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'script', 1],
- [:T_TEXT, 'foo ').should == [
+ [:T_ELEM_START, nil, 1],
+ [:T_ELEM_NAME, 'script', 1],
+ [:T_TEXT, '<', 1],
+ [:T_TEXT, 'style>', 1],
+ [:T_TEXT, '<', 1],
+ [:T_TEXT, '/style>', 1],
[:T_ELEM_END, nil, 1]
]
end
diff --git a/spec/oga/xml/lexer/html_style_spec.rb b/spec/oga/xml/lexer/html_style_spec.rb
index 6ac7353..310ee03 100644
--- a/spec/oga/xml/lexer/html_style_spec.rb
+++ b/spec/oga/xml/lexer/html_style_spec.rb
@@ -3,7 +3,7 @@ require 'spec_helper'
describe Oga::XML::Lexer do
describe 'HTML style elements' do
it 'lexes an empty ', :html => true).should == [
+ lex_html('').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'style', 1],
[:T_ELEM_END, nil, 1]
@@ -11,16 +11,30 @@ describe Oga::XML::Lexer do
end
it 'treats the content of a style tag as plain text' do
- lex('', :html => true).should == [
+ lex_html('').should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'style', 1],
- [:T_TEXT, 'foo ').should == [
+ [:T_ELEM_START, nil, 1],
+ [:T_ELEM_NAME, 'style', 1],
+ [:T_TEXT, '<', 1],
+ [:T_TEXT, 'script>', 1],
+ [:T_TEXT, '<', 1],
+ [:T_TEXT, '/script>', 1],
[:T_ELEM_END, nil, 1]
]
end
it 'lexes a multi-line ", :html => true).should == [
+ lex_html("").should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'style', 1],
[:T_TEXT, "foo\nbar", 1],
@@ -29,9 +43,7 @@ describe Oga::XML::Lexer do
end
it 'lexes a multi-line ")
-
- lex(io, :html => true).should == [
+ lex_stringio("", :html => true).should == [
[:T_ELEM_START, nil, 1],
[:T_ELEM_NAME, 'style', 1],
[:T_TEXT, "foo\n", 1],