Support for Unicode XML/HTML identifiers
Technically, HTML only allows ASCII names, but enforcing that restriction actually requires more work than simply allowing Unicode names.
This commit is contained in:
parent
139985612b
commit
dde644cd79
|
@ -48,7 +48,9 @@
|
|||
|
||||
newline = '\r\n' | '\n' | '\r';
|
||||
whitespace = [ \t];
|
||||
ident_char = [a-zA-Z0-9\-_\.];
|
||||
|
||||
unicode = any - ascii;
|
||||
ident_char = unicode | [a-zA-Z0-9\-_\.];
|
||||
identifier = ident_char+;
|
||||
|
||||
whitespace_or_newline = whitespace | newline;
|
||||
|
|
|
@ -315,4 +315,31 @@ describe Oga::XML::Lexer do
|
|||
]
|
||||
end
|
||||
|
||||
# A non-ASCII character inside a tag name must stay part of the element name
# token. The trailing 1 in each token is presumably the line number.
it 'lexes an element with a name containing Unicode characters' do
  tokens = lex('<foobár />')

  tokens.should == [[:T_ELEM_NAME, 'foobár', 1], [:T_ELEM_END, nil, 1]]
end
|
||||
|
||||
# An underscore inside a tag name must stay part of the element name token.
# The trailing 1 in each token is presumably the line number.
it 'lexes an element with a name containing an underscore' do
  tokens = lex('<foo_bar />')

  tokens.should == [[:T_ELEM_NAME, 'foo_bar', 1], [:T_ELEM_END, nil, 1]]
end
|
||||
|
||||
# A dash inside a tag name must stay part of the element name token.
# The trailing 1 in each token is presumably the line number.
it 'lexes an element with a name containing a dash' do
  tokens = lex('<foo-bar />')

  tokens.should == [[:T_ELEM_NAME, 'foo-bar', 1], [:T_ELEM_END, nil, 1]]
end
|
||||
|
||||
# Digits inside a tag name must stay part of the element name token.
# The trailing 1 in each token is presumably the line number.
it 'lexes an element with a name containing numbers' do
  tokens = lex('<foo123 />')

  tokens.should == [[:T_ELEM_NAME, 'foo123', 1], [:T_ELEM_END, nil, 1]]
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue