Lexing of comments.

This fixes #4.
This commit is contained in:
Yorick Peterse 2014-02-28 23:27:23 +01:00
parent 92ae48f905
commit d9ef33e1f8
2 changed files with 48 additions and 17 deletions

View File

@ -197,14 +197,34 @@ module Oga
cdata := |* cdata := |*
cdata_end => { cdata_end => {
emit_text_buffer emit_text_buffer
t(:T_CDATA_END) t(:T_CDATA_END)
fret; fret;
}; };
# Consume everything else character by character and store it in a any => buffer_text;
# separate buffer. *|;
# Comments
#
# http://www.w3.org/TR/html-markup/syntax.html#comments
#
# Comments are lexed into 3 parts: the start tag, the content and the end
# tag.
#
# Unlike the W3 specification, these rules *do* allow character sequences
# such as `--` and `->` inside a comment. Adding extra checks for these
# sequences would actually make the rules/actions more complex.
#
comment_start = '<!--';
comment_end = '-->';
comment := |*
comment_end => {
emit_text_buffer
t(:T_COMMENT_END)
fret;
};
any => buffer_text; any => buffer_text;
*|; *|;
@ -214,17 +234,19 @@ module Oga
doctype_start => { doctype_start => {
t(:T_DOCTYPE_START) t(:T_DOCTYPE_START)
fcall doctype; fcall doctype;
}; };
# @cdata_buffer is used to store the content of the CDATA tag.
cdata_start => { cdata_start => {
t(:T_CDATA_START) t(:T_CDATA_START)
fcall cdata; fcall cdata;
}; };
comment_start => {
t(:T_COMMENT_START)
fcall comment;
};
# General rules and actions. # General rules and actions.
'<' => { t(:T_SMALLER) }; '<' => { t(:T_SMALLER) };
'>' => { t(:T_GREATER) }; '>' => { t(:T_GREATER) };

View File

@ -4,16 +4,25 @@ describe Oga::Lexer do
context 'comments' do context 'comments' do
example 'lex a comment' do example 'lex a comment' do
lex('<!-- foo -->').should == [ lex('<!-- foo -->').should == [
[:T_SMALLER, '<', 1, 1], [:T_COMMENT_START, '<!--', 1, 1],
[:T_BANG, '!', 1, 2], [:T_TEXT, ' foo ', 1, 5],
[:T_DASH, '-', 1, 3], [:T_COMMENT_END, '-->', 1, 10]
[:T_DASH, '-', 1, 4], ]
[:T_SPACE, ' ', 1, 5], end
[:T_TEXT, 'foo', 1, 6],
[:T_SPACE, ' ', 1, 9], example 'lex a comment containing --' do
[:T_DASH, '-', 1, 10], lex('<!-- -- -->').should == [
[:T_DASH, '-', 1, 11], [:T_COMMENT_START, '<!--', 1, 1],
[:T_GREATER, '>', 1, 12] [:T_TEXT, ' -- ', 1, 5],
[:T_COMMENT_END, '-->', 1, 9]
]
end
example 'lex a comment containing ->' do
lex('<!-- -> -->').should == [
[:T_COMMENT_START, '<!--', 1, 1],
[:T_TEXT, ' -> ', 1, 5],
[:T_COMMENT_END, '-->', 1, 9]
] ]
end end
end end