Lexing of comments.

This fixes #4.
This commit is contained in:
Yorick Peterse 2014-02-28 23:27:23 +01:00
parent 92ae48f905
commit d9ef33e1f8
2 changed files with 48 additions and 17 deletions

View File

@ -197,14 +197,34 @@ module Oga
# Scanner for the body of a CDATA section. It is entered with `fcall cdata`
# right after the T_CDATA_START token has been emitted.
cdata := |*
# Closing tag reached: emit_text_buffer runs first (presumably flushing the
# text gathered by buffer_text -- confirm against its definition), then the
# end token is emitted and `fret` returns to the state that made the fcall.
cdata_end => {
emit_text_buffer
t(:T_CDATA_END)
fret;
};
# Consume everything else character by character and store it in a
# separate buffer.
any => buffer_text;
*|;
# Comments
#
# http://www.w3.org/TR/html-markup/syntax.html#comments
#
# Comments are lexed into 3 parts: the start tag, the content and the end
# tag.
#
# Unlike the W3 specification these rules *do* allow character sequences
# such as `--` and `->`. Putting extra checks in for these sequences
# would actually make the rules/actions more complex.
#
# Literal delimiters that open and close a comment.
comment_start = '<!--';
comment_end = '-->';
# Scanner for the body of a comment. It is entered with `fcall comment` right
# after the T_COMMENT_START token has been emitted.
comment := |*
# Closing tag reached: emit_text_buffer runs first (presumably flushing the
# text gathered by buffer_text -- confirm against its definition), then the
# end token is emitted and `fret` returns to the state that made the fcall.
comment_end => {
emit_text_buffer
t(:T_COMMENT_END)
fret;
};
# Every other character is handed to buffer_text one at a time. A `-` that
# does not start a complete `-->` falls through to this rule, which is why
# sequences such as `--` and `->` end up inside the comment text.
any => buffer_text;
*|;
@ -214,17 +234,19 @@ module Oga
# Start of a doctype: emit the start token, then hand control to the
# `doctype` machine (defined outside this excerpt).
doctype_start => {
t(:T_DOCTYPE_START)
fcall doctype;
};
# @cdata_buffer is used to store the content of the CDATA tag.
# NOTE(review): the `cdata` machine collects its content through
# buffer_text; confirm that @cdata_buffer is still the backing store.
cdata_start => {
t(:T_CDATA_START)
fcall cdata;
};
# Start of a comment: emit the start token, then hand control to the
# `comment` machine, which returns here with `fret` once it has seen `-->`.
# Ragel scanners prefer the longest match, so `<!--` wins over the
# single-character `<` rule below.
comment_start => {
t(:T_COMMENT_START)
fcall comment;
};
# General rules and actions.
'<' => { t(:T_SMALLER) };
'>' => { t(:T_GREATER) };

View File

@ -4,16 +4,25 @@ describe Oga::Lexer do
# A comment is expected to lex into three tokens: the opening `<!--`, the
# buffered body text and the closing `-->`. Each expected entry is
# [type, value, <int>, <int>]; the two integers appear to be the line and the
# starting column of the token -- confirm against the lexer's token format.
context 'comments' do
  # Baseline: the whole body, including surrounding whitespace, is a single
  # T_TEXT token rather than separate T_SPACE/T_DASH/... tokens.
  example 'lex a comment' do
    lex('<!-- foo -->').should == [
      [:T_COMMENT_START, '<!--', 1, 1],
      [:T_TEXT, ' foo ', 1, 5],
      [:T_COMMENT_END, '-->', 1, 10]
    ]
  end

  # `--` inside the body must not be mistaken for the start of the end tag.
  example 'lex a comment containing --' do
    lex('<!-- -- -->').should == [
      [:T_COMMENT_START, '<!--', 1, 1],
      [:T_TEXT, ' -- ', 1, 5],
      [:T_COMMENT_END, '-->', 1, 9]
    ]
  end

  # `->` inside the body must not terminate the comment either.
  example 'lex a comment containing ->' do
    lex('<!-- -> -->').should == [
      [:T_COMMENT_START, '<!--', 1, 1],
      [:T_TEXT, ' -> ', 1, 5],
      [:T_COMMENT_END, '-->', 1, 9]
    ]
  end
end