From dde644cd7991f5d24e662e0fc4094bd644274046 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Mon, 29 Jun 2015 21:08:01 +0200 Subject: [PATCH] Support for Unicode XML/HTML identifiers Technically HTML only allows ASCII names, but restricting HTML identifiers to ASCII actually requires more work than just allowing Unicode identifiers for both XML and HTML. --- ext/ragel/base_lexer.rl | 4 +++- spec/oga/xml/lexer/elements_spec.rb | 33 ++++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index d05fc09..e41b40e 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -48,7 +48,9 @@ newline = '\r\n' | '\n' | '\r'; whitespace = [ \t]; - ident_char = [a-zA-Z0-9\-_\.]; + + unicode = any - ascii; + ident_char = unicode | [a-zA-Z0-9\-_\.]; identifier = ident_char+; whitespace_or_newline = whitespace | newline; diff --git a/spec/oga/xml/lexer/elements_spec.rb b/spec/oga/xml/lexer/elements_spec.rb index a0effd3..7bf860b 100644 --- a/spec/oga/xml/lexer/elements_spec.rb +++ b/spec/oga/xml/lexer/elements_spec.rb @@ -307,12 +307,39 @@ describe Oga::XML::Lexer do ] end end - + it 'lexes an element with inline dots' do lex('').should == [ - [:T_ELEM_NAME, "SOAP..TestMapping..MappablePerson", 1], + [:T_ELEM_NAME, "SOAP..TestMapping..MappablePerson", 1], + [:T_ELEM_END, nil, 1] + ] + end + + it 'lexes an element with a name containing Unicode characters' do + lex('').should == [ + [:T_ELEM_NAME, 'foobár', 1], + [:T_ELEM_END, nil, 1] + ] + end + + it 'lexes an element with a name containing an underscore' do + lex('').should == [ + [:T_ELEM_NAME, 'foo_bar', 1], + [:T_ELEM_END, nil, 1] + ] + end + + it 'lexes an element with a name containing a dash' do + lex('').should == [ + [:T_ELEM_NAME, 'foo-bar', 1], + [:T_ELEM_END, nil, 1] + ] + end + + it 'lexes an element with a name containing numbers' do + lex('').should == [ + [:T_ELEM_NAME, 'foo123', 1], [:T_ELEM_END, nil, 1] ] end - end