Support for Unicode XML/HTML identifiers

Technically HTML only allows ASCII names, but enforcing that restriction
would actually take more work than simply allowing Unicode names as well.
This commit is contained in:
Yorick Peterse 2015-06-29 21:08:01 +02:00
parent 139985612b
commit dde644cd79
2 changed files with 33 additions and 4 deletions

View File

@ -48,7 +48,9 @@
newline = '\r\n' | '\n' | '\r'; newline = '\r\n' | '\n' | '\r';
whitespace = [ \t]; whitespace = [ \t];
ident_char = [a-zA-Z0-9\-_\.];
# Every character of the alphabet except those in the built-in `ascii`
# machine; presumably this is what lets the bytes of multi-byte (UTF-8)
# input through -- confirm against the lexer's input encoding.
unicode = any - ascii;
# A single identifier character: anything matched by `unicode`, or an ASCII
# letter, digit, "-", "_" or ".".
ident_char = unicode | [a-zA-Z0-9\-_\.];
identifier = ident_char+; identifier = ident_char+;
whitespace_or_newline = whitespace | newline; whitespace_or_newline = whitespace | newline;

View File

@ -307,12 +307,39 @@ describe Oga::XML::Lexer do
] ]
end end
end end
it 'lexes an element with inline dots' do it 'lexes an element with inline dots' do
lex('<SOAP..TestMapping..MappablePerson>').should == [ lex('<SOAP..TestMapping..MappablePerson>').should == [
[:T_ELEM_NAME, "SOAP..TestMapping..MappablePerson", 1], [:T_ELEM_NAME, "SOAP..TestMapping..MappablePerson", 1],
[:T_ELEM_END, nil, 1]
]
end
# Element names may contain non-ASCII characters such as "á".
it 'lexes an element with a name containing Unicode characters' do
  tokens = lex('<foobár />')

  tokens.should == [
    [:T_ELEM_NAME, 'foobár', 1],
    [:T_ELEM_END, nil, 1]
  ]
end
# An underscore is kept as part of the element name.
it 'lexes an element with a name containing an underscore' do
  expected = [
    [:T_ELEM_NAME, 'foo_bar', 1],
    [:T_ELEM_END, nil, 1]
  ]

  lex('<foo_bar />').should == expected
end
# A dash is kept as part of the element name.
it 'lexes an element with a name containing a dash' do
  name_token = [:T_ELEM_NAME, 'foo-bar', 1]
  end_token  = [:T_ELEM_END, nil, 1]

  lex('<foo-bar />').should == [name_token, end_token]
end
it 'lexes an element with a name containing numbers' do
lex('<foo123 />').should == [
[:T_ELEM_NAME, 'foo123', 1],
[:T_ELEM_END, nil, 1] [:T_ELEM_END, nil, 1]
] ]
end end
end end