From 3b633ff41c48c44893e42d3ba29ef7a5e3d70617 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Mon, 29 Jun 2015 16:35:48 +0200 Subject: [PATCH] Relax support for HTML unquoted attribute values This allows for parsing of HTML such as: <a href=lol("javascript")></a> Here the "href" attribute would have its value set to: lol("javascript") Fixes #119 --- ext/ragel/base_lexer.rl | 56 ++++++++++++++++---------- spec/oga/html/lexer/attributes_spec.rb | 26 ++++++++++++ 2 files changed, 61 insertions(+), 21 deletions(-) diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index 87a99b4..17ee5b4 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -419,18 +419,24 @@ any $count_newlines; *|; - # Characters that can be used for unquoted HTML attribute values. - # See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example - # for more info. - html_unquoted_value = - ^(squote | dquote | whitespace_or_newline) - ^('`' | '=' | '<' | '>' | whitespace_or_newline)+; - # Machine used after matching the "=" of an attribute and just before moving # into the actual attribute value. attribute_pre := |* whitespace_or_newline $count_newlines; + squote | dquote => { + fhold; + + if ( lines > 0 ) + { + advance_line(lines); + + lines = 0; + } + + fnext quoted_attribute_value; + }; + any => { fhold; @@ -443,25 +449,33 @@ if ( html_p ) { - fnext html_attribute_value; + fnext unquoted_attribute_value; } + /* XML doesn't support unquoted attribute values */ else { - fnext xml_attribute_value; + fret; } }; *|; - # Machine used for processing HTML attribute values. - html_attribute_value := |* - squote | dquote => { - fhold; - fnext xml_attribute_value; - }; - - # Unquoted attribute values are lexed as if they were single quoted - # strings. - html_unquoted_value => { + # Machine for processing unquoted HTML attribute values. 
+ # + # The HTML specification describes a set of characters that can be allowed + # in an unquoted value at https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example. + # + # As is always the case with HTML everybody completely ignores this + # specification and thus every library and browser out there is expected to + # support input such as `<a href=lol("javascript")>`. + # + # Oga too has to support this, thus the only characters it disallows in + # unquoted attribute values are: + # + # * > (used for terminating open tags) + # * whitespace + # + unquoted_attribute_value := |* + ^('>' | whitespace_or_newline)+ => { callback_simple(id_on_string_squote); callback(id_on_string_body, data, encoding, ts, te); @@ -472,8 +486,8 @@ any => hold_and_return; *|; - # Machine used for processing XML attribute values. - xml_attribute_value := |* + # Machine used for processing quoted XML/HTML attribute values. + quoted_attribute_value := |* # The following two actions use "fnext" instead of "fcall". Combined # with "element_head" using "fcall" to jump to this machine this means # we can return back to "element_head" after processing a single string. diff --git a/spec/oga/html/lexer/attributes_spec.rb b/spec/oga/html/lexer/attributes_spec.rb index 61d8b63..e31c525 100644 --- a/spec/oga/html/lexer/attributes_spec.rb +++ b/spec/oga/html/lexer/attributes_spec.rb @@ -58,6 +58,32 @@ describe Oga::XML::Lexer do ] end + it 'lexes an attribute with an unquoted chunk of Javascript' do + lex_html('').should == [ [:T_ELEM_NAME, 'a', 1],