Support escaping dots in CSS identifiers

Escaping hash characters and whitespace is _not_ supported, as neither
is valid in element/attribute names (e.g. <foo#bar /> is invalid
XML/HTML).

Escaping single/double quotes also won't be supported for the time
being. It's quite a pain to get this to work right in not just CSS but
also XPath and XML/HTML, for very little gain. Should there be enough
users with an actual use case (other than "But the spec says ...!") I'll
look into this again.

Fixes #124
This commit is contained in:
Yorick Peterse 2015-09-02 19:51:09 +02:00
parent aef7c510c2
commit 44630c27ff
2 changed files with 28 additions and 3 deletions

View File

@ -49,6 +49,7 @@ module Oga
# @see [#add_token]
def advance(&block)
@block = block
@escaped = false
data = @data # saves ivar lookups while lexing.
ts = nil
@ -143,10 +144,22 @@ module Oga
# Identifiers are used for element and attribute names. Identifiers have
# to start with a letter.
identifier = '*' | [a-zA-Z_]+ [a-zA-Z\-_0-9]*;
# A run of zero or more characters allowed in the body of an identifier.
ident_word = [a-zA-Z\-_0-9]*;
# An escaped dot (a backslash followed by "."). Flags the current match as
# escaped so that emit_identifier can turn "\." back into ".".
ident_escape = '\\.' %{ @escaped = true };
# Either "*" or a name that may contain escaped dots between its words, such
# as "foo\.bar".
identifier = '*' | [a-zA-Z_]+ ident_word (ident_escape ident_word)*;
action emit_identifier {
emit(:T_IDENT, ts, te)
value = slice_input(ts, te)
# Translates "foo\.bar" into "foo.bar"
if @escaped
value = value.gsub('\.', '.')
@escaped = false
end
add_token(:T_IDENT, value)
}
# Operators

View File

@ -10,6 +10,18 @@ describe Oga::CSS::Lexer do
lex_css('_h3').should == [[:T_IDENT, '_h3']]
end
# Each "\." in the selector is an escaped dot, so the whole input is a single
# identifier whose token value contains plain dots.
it 'lexes a path with an escaped identifier' do
lex_css('foo\.bar\.baz').should == [[:T_IDENT, 'foo.bar.baz']]
end
# An escaped identifier ends at the whitespace; the name that follows is lexed
# as a separate identifier.
it 'lexes a path with an escaped identifier followed by another identifier' do
lex_css('foo\.bar baz').should == [
[:T_IDENT, 'foo.bar'],
[:T_SPACE, nil],
[:T_IDENT, 'baz']
]
end
it 'lexes a path with two members' do
lex_css('div h3').should == [
[:T_IDENT, 'div'],