Better lexing of CDATA tags.

This means the lexer is now capable of lexing CDATA tags that contain text such
as ]].
This commit is contained in:
Yorick Peterse 2014-02-28 20:05:12 +01:00
parent 6138945d53
commit 2294bf19f4
2 changed files with 50 additions and 29 deletions

View File

@ -56,7 +56,16 @@ module Oga
end end
def t(type, start = @ts, stop = @te) def t(type, start = @ts, stop = @te)
value = @data[start...stop] value = text(start, stop)
add_token(type, value)
end
# Extracts a slice of the lexer input.
#
# @param start [Integer] Index of the first character to include. Defaults
#  to `@ts` (presumably the scanner's current token start; confirm against
#  the Ragel setup, which is not visible here).
# @param stop [Integer] Index one past the last character to include (the
#  range is end-exclusive). Defaults to `@te`.
# @return The portion of `@data` covered by `start...stop`.
def text(start = @ts, stop = @te)
  @data[start...stop]
end
def add_token(type, value)
token = [type, value, @line, @column] token = [type, value, @line, @column]
advance_column(value.length) advance_column(value.length)
@ -108,16 +117,6 @@ module Oga
doctype = smaller whitespace* bang whitespace* 'DOCTYPE'i whitespace* doctype = smaller whitespace* bang whitespace* 'DOCTYPE'i whitespace*
'HTML'i whitespace* any* greater; 'HTML'i whitespace* any* greater;
# CDATA
cdata_start = smaller bang lbracket 'CDATA' lbracket;
cdata_end = rbracket rbracket greater;
main := |*
whitespace => { t(:T_SPACE) };
newline => { t(:T_NEWLINE); advance_line };
doctype => { t(:T_DOCTYPE) };
# CDATA # CDATA
# #
# http://www.w3.org/TR/html-markup/syntax.html#cdata-sections # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
@ -128,25 +127,39 @@ module Oga
# In HTML CDATA tags have no meaning/are not supported. Oga does # In HTML CDATA tags have no meaning/are not supported. Oga does
# support them but treats their contents as plain text. # support them but treats their contents as plain text.
# #
cdata_start cdata_start = smaller bang lbracket 'CDATA' lbracket;
%{ cdata_end = rbracket rbracket greater;
@cdata_start = p
t(:T_CDATA_START, @ts, p)
}
# Consume everything except ], which is the start of the ending tag. cdata := |*
(any - rbracket)+ cdata_start => {
%{ t(:T_CDATA_START)
t(:T_TEXT, @cdata_start, p)
@cdata_start = nil @cdata_buffer = ''
}
cdata_end
>{
t(:T_CDATA_END, p, pe)
}; };
cdata_end => {
add_token(:T_TEXT, @cdata_buffer)
@cdata_buffer = nil
t(:T_CDATA_END)
fgoto main;
};
# Consume everything else character by character and store it in a
# separate buffer.
any => { @cdata_buffer << text };
*|;
main := |*
whitespace => { t(:T_SPACE) };
newline => { t(:T_NEWLINE); advance_line };
doctype => { t(:T_DOCTYPE) };
# Jump to the cdata machine right away without processing anything.
cdata_start >{ fhold; fgoto cdata; };
# General rules and actions. # General rules and actions.
smaller => { t(:T_SMALLER) }; smaller => { t(:T_SMALLER) };
greater => { t(:T_GREATER) }; greater => { t(:T_GREATER) };

View File

@ -17,5 +17,13 @@ describe Oga::Lexer do
[:T_CDATA_END, ']]>', 1, 20] [:T_CDATA_END, ']]>', 1, 20]
] ]
end end
# Checks that a CDATA section whose body is `]]` is lexed correctly: the
# first two brackets must come out as T_TEXT, and only the final `]]>` may
# be reported as the closing tag. The expected columns follow from the
# token lengths: the 9-character start tag begins at column 1, the
# 2-character text at column 10, and the end tag at column 12.
example 'lex double brackets inside a CDATA tag' do
lex('<![CDATA[]]]]>').should == [
[:T_CDATA_START, '<![CDATA[', 1, 1],
[:T_TEXT, ']]', 1, 10],
[:T_CDATA_END, ']]>', 1, 12]
]
end
end end
end end