From 37c5b819fafa11dc35ed366b62731c1ca6a3b640 Mon Sep 17 00:00:00 2001
From: Yorick Peterse
Date: Thu, 3 Sep 2015 11:21:45 +0200
Subject: [PATCH] Unicode support for CSS/XPath

Fixes #140
---
 lib/oga/css/lexer.rl                 | 11 ++++++++---
 lib/oga/xpath/lexer.rl               |  6 +++++-
 spec/oga/css/lexer/paths_spec.rb     |  8 ++++++++
 spec/oga/xpath/lexer/general_spec.rb | 12 ++++++++++++
 4 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/lib/oga/css/lexer.rl b/lib/oga/css/lexer.rl
index b36cbb6..a9416e2 100644
--- a/lib/oga/css/lexer.rl
+++ b/lib/oga/css/lexer.rl
@@ -144,11 +144,16 @@ module Oga
         # Identifiers are used for element and attribute names. Identifiers have
         # to start with a letter.
 
-        ident_word = [a-zA-Z\-_0-9]*;
+        unicode = any - ascii;
 
-        ident_escape = '\\.' %{ @escaped = true };
+        unicode_or_ascii = (unicode | [a-zA-Z\-_0-9])*;
 
-        identifier = '*' | [a-zA-Z_]+ ident_word (ident_escape ident_word)*;
+        escaped_dot = '\\.' %{ @escaped = true };
+
+        identifier
+          = '*'
+          | (unicode | [a-zA-Z_]) unicode_or_ascii (escaped_dot unicode_or_ascii)*
+          ;
 
         action emit_identifier {
           value = slice_input(ts, te)
diff --git a/lib/oga/xpath/lexer.rl b/lib/oga/xpath/lexer.rl
index 821b398..1192d13 100644
--- a/lib/oga/xpath/lexer.rl
+++ b/lib/oga/xpath/lexer.rl
@@ -176,7 +176,11 @@ module Oga
         # Identifiers are used for element names, namespaces, attribute names,
         # etc. Identifiers have to start with a letter.
 
-        identifier = '*' | [a-zA-Z_]+ [a-zA-Z\-_0-9]*;
+        unicode = any - ascii;
+
+        unicode_or_ascii = (unicode | [a-zA-Z\-_0-9\.])*;
+
+        identifier = '*' | (unicode | [a-zA-Z_]) unicode_or_ascii ;
 
         action emit_identifier {
           emit(:T_IDENT, ts, te)
diff --git a/spec/oga/css/lexer/paths_spec.rb b/spec/oga/css/lexer/paths_spec.rb
index 6e6c0da..2d85400 100644
--- a/spec/oga/css/lexer/paths_spec.rb
+++ b/spec/oga/css/lexer/paths_spec.rb
@@ -6,6 +6,14 @@ describe Oga::CSS::Lexer do
       lex_css('h3').should == [[:T_IDENT, 'h3']]
     end
 
+    it 'lexes a path with Unicode characters' do
+      lex_css('áâã').should == [[:T_IDENT, 'áâã']]
+    end
+
+    it 'lexes a path with Unicode and ASCII characters' do
+      lex_css('áâãfoo').should == [[:T_IDENT, 'áâãfoo']]
+    end
+
     it 'lexes a simple path starting with an underscore' do
       lex_css('_h3').should == [[:T_IDENT, '_h3']]
     end
diff --git a/spec/oga/xpath/lexer/general_spec.rb b/spec/oga/xpath/lexer/general_spec.rb
index 2f77c82..6926249 100644
--- a/spec/oga/xpath/lexer/general_spec.rb
+++ b/spec/oga/xpath/lexer/general_spec.rb
@@ -6,6 +6,18 @@ describe Oga::XPath::Lexer do
       lex_xpath('/foo').should == [[:T_SLASH, nil], [:T_IDENT, 'foo']]
     end
 
+    it 'lexes an expression using Unicode identifiers' do
+      lex_xpath('fóó').should == [[:T_IDENT, 'fóó']]
+    end
+
+    it 'lexes an expression using Unicode plus ASCII identifiers' do
+      lex_xpath('fóóbar').should == [[:T_IDENT, 'fóóbar']]
+    end
+
+    it 'lexes an expression using an identifier with a dot' do
+      lex_xpath('foo.bar').should == [[:T_IDENT, 'foo.bar']]
+    end
+
     it 'lexes a simple expression with a test starting with an underscore' do
       lex_xpath('/_foo').should == [[:T_SLASH, nil], [:T_IDENT, '_foo']]
     end