Unicode support for CSS/XPath

Fixes #140
This commit is contained in:
Yorick Peterse 2015-09-03 11:21:45 +02:00
parent 44630c27ff
commit 37c5b819fa
4 changed files with 33 additions and 4 deletions

View File

@ -144,11 +144,16 @@ module Oga
# Identifiers are used for element and attribute names. Identifiers have # Identifiers are used for element and attribute names. Identifiers have
# to start with a letter. # to start with a letter.
ident_word = [a-zA-Z\-_0-9]*; unicode = any - ascii;
ident_escape = '\\.' %{ @escaped = true }; unicode_or_ascii = (unicode | [a-zA-Z\-_0-9])*;
identifier = '*' | [a-zA-Z_]+ ident_word (ident_escape ident_word)*; escaped_dot = '\\.' %{ @escaped = true };
identifier
= '*'
| (unicode | [a-zA-Z_]) unicode_or_ascii (escaped_dot unicode_or_ascii)*
;
action emit_identifier { action emit_identifier {
value = slice_input(ts, te) value = slice_input(ts, te)

View File

@ -176,7 +176,11 @@ module Oga
# Identifiers are used for element names, namespaces, attribute names, # Identifiers are used for element names, namespaces, attribute names,
# etc. Identifiers have to start with a letter. # etc. Identifiers have to start with a letter.
identifier = '*' | [a-zA-Z_]+ [a-zA-Z\-_0-9]*; unicode = any - ascii;
unicode_or_ascii = (unicode | [a-zA-Z\-_0-9\.])*;
identifier = '*' | (unicode | [a-zA-Z_]) unicode_or_ascii ;
action emit_identifier { action emit_identifier {
emit(:T_IDENT, ts, te) emit(:T_IDENT, ts, te)

View File

@ -6,6 +6,14 @@ describe Oga::CSS::Lexer do
lex_css('h3').should == [[:T_IDENT, 'h3']] lex_css('h3').should == [[:T_IDENT, 'h3']]
end end
it 'lexes a path with Unicode characters' do
  # A selector made up solely of non-ASCII characters is expected to come
  # back as one identifier token spanning the whole input.
  tokens = lex_css('áâã')

  tokens.should == [[:T_IDENT, 'áâã']]
end
it 'lexes a path with Unicode and ASCII characters' do
  # Non-ASCII characters followed by ASCII letters must stay together in a
  # single identifier token rather than being split at the boundary.
  expected = [[:T_IDENT, 'áâãfoo']]

  lex_css('áâãfoo').should == expected
end
it 'lexes a simple path starting with an underscore' do it 'lexes a simple path starting with an underscore' do
lex_css('_h3').should == [[:T_IDENT, '_h3']] lex_css('_h3').should == [[:T_IDENT, '_h3']]
end end

View File

@ -6,6 +6,18 @@ describe Oga::XPath::Lexer do
lex_xpath('/foo').should == [[:T_SLASH, nil], [:T_IDENT, 'foo']] lex_xpath('/foo').should == [[:T_SLASH, nil], [:T_IDENT, 'foo']]
end end
it 'lexes an expression using Unicode identifiers' do
  # An identifier that mixes ASCII and non-ASCII letters is expected to be
  # emitted as one T_IDENT token.
  tokens = lex_xpath('fóó')

  tokens.should == [[:T_IDENT, 'fóó']]
end
it 'lexes an expression using Unicode plus ASCII identifiers' do
  # A trailing ASCII run after non-ASCII characters belongs to the same
  # identifier token.
  expected = [[:T_IDENT, 'fóóbar']]

  lex_xpath('fóóbar').should == expected
end
it 'lexes an expression using an identifier with a dot' do
  # A dot inside a name is part of the identifier: the whole input is
  # expected back as a single T_IDENT token.
  tokens = lex_xpath('foo.bar')

  tokens.should == [[:T_IDENT, 'foo.bar']]
end
it 'lexes a simple expression with a test starting with an underscore' do it 'lexes a simple expression with a test starting with an underscore' do
lex_xpath('/_foo').should == [[:T_SLASH, nil], [:T_IDENT, '_foo']] lex_xpath('/_foo').should == [[:T_SLASH, nil], [:T_IDENT, '_foo']]
end end