Support for Unicode XML/HTML identifiers
Technically, HTML only allows ASCII names, but enforcing that restriction actually requires more work than simply allowing Unicode names.
This commit is contained in:
parent
139985612b
commit
dde644cd79
|
@ -48,7 +48,9 @@
|
||||||
|
|
||||||
newline = '\r\n' | '\n' | '\r';
|
newline = '\r\n' | '\n' | '\r';
|
||||||
whitespace = [ \t];
|
whitespace = [ \t];
|
||||||
ident_char = [a-zA-Z0-9\-_\.];
|
|
||||||
|
unicode = any - ascii;
|
||||||
|
ident_char = unicode | [a-zA-Z0-9\-_\.];
|
||||||
identifier = ident_char+;
|
identifier = ident_char+;
|
||||||
|
|
||||||
whitespace_or_newline = whitespace | newline;
|
whitespace_or_newline = whitespace | newline;
|
||||||
|
|
|
@ -315,4 +315,31 @@ describe Oga::XML::Lexer do
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it 'lexes an element with a name containing Unicode characters' do
|
||||||
|
lex('<foobár />').should == [
|
||||||
|
[:T_ELEM_NAME, 'foobár', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'lexes an element with a name containing an underscore' do
|
||||||
|
lex('<foo_bar />').should == [
|
||||||
|
[:T_ELEM_NAME, 'foo_bar', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'lexes an element with a name containing a dash' do
|
||||||
|
lex('<foo-bar />').should == [
|
||||||
|
[:T_ELEM_NAME, 'foo-bar', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'lexes an element with a name containing numbers' do
|
||||||
|
lex('<foo123 />').should == [
|
||||||
|
[:T_ELEM_NAME, 'foo123', 1],
|
||||||
|
[:T_ELEM_END, nil, 1]
|
||||||
|
]
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue