From 44630c27ffe5626619b4628f340e9168b8430c69 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Wed, 2 Sep 2015 19:51:09 +0200 Subject: [PATCH] Support escaping dots in CSS identifiers Escaping hash characters and whitespace is _not_ supported as neither are valid element/attribute names (e.g. <foo#bar> is invalid XML/HTML). Escaping single/double quotes also won't be supported for the time being. It's quite a pain to get this to work right in not just CSS but also XPath and XML/HTML, for very little gain. Should there be enough users with an actual use case (other than "But the spec says ...!") I'll look into this again. Fixes #124 --- lib/oga/css/lexer.rl | 19 ++++++++++++++++--- spec/oga/css/lexer/paths_spec.rb | 12 ++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/lib/oga/css/lexer.rl b/lib/oga/css/lexer.rl index dde2b7e..b36cbb6 100644 --- a/lib/oga/css/lexer.rl +++ b/lib/oga/css/lexer.rl @@ -48,7 +48,8 @@ module Oga # # @see [#add_token] def advance(&block) - @block = block + @block = block + @escaped = false data = @data # saves ivar lookups while lexing. ts = nil @@ -143,10 +144,22 @@ module Oga # Identifiers are used for element and attribute names. Identifiers have # to start with a letter. - identifier = '*' | [a-zA-Z_]+ [a-zA-Z\-_0-9]*; + ident_word = [a-zA-Z\-_0-9]*; + + ident_escape = '\\.' 
%{ @escaped = true }; + + identifier = '*' | [a-zA-Z_]+ ident_word (ident_escape ident_word)*; action emit_identifier { - emit(:T_IDENT, ts, te) + value = slice_input(ts, te) + + # Translates "foo\.bar" into "foo.bar" + if @escaped + value = value.gsub('\.', '.') + @escaped = false + end + + add_token(:T_IDENT, value) } # Operators diff --git a/spec/oga/css/lexer/paths_spec.rb b/spec/oga/css/lexer/paths_spec.rb index de7c3d3..6e6c0da 100644 --- a/spec/oga/css/lexer/paths_spec.rb +++ b/spec/oga/css/lexer/paths_spec.rb @@ -10,6 +10,18 @@ describe Oga::CSS::Lexer do lex_css('_h3').should == [[:T_IDENT, '_h3']] end + it 'lexes a path with an escaped identifier' do + lex_css('foo\.bar\.baz').should == [[:T_IDENT, 'foo.bar.baz']] + end + + it 'lexes a path with an escaped identifier followed by another identifier' do + lex_css('foo\.bar baz').should == [ + [:T_IDENT, 'foo.bar'], + [:T_SPACE, nil], + [:T_IDENT, 'baz'] + ] + end + it 'lexes a path with two members' do lex_css('div h3').should == [ [:T_IDENT, 'div'],