Better lexing of CDATA tags.
This means the lexer can now handle CDATA tags whose contents include text such as "]]".
This commit is contained in:
parent
6138945d53
commit
2294bf19f4
|
@ -56,7 +56,16 @@ module Oga
|
||||||
end
|
end
|
||||||
|
|
||||||
def t(type, start = @ts, stop = @te)
|
def t(type, start = @ts, stop = @te)
|
||||||
value = @data[start...stop]
|
value = text(start, stop)
|
||||||
|
|
||||||
|
add_token(type, value)
|
||||||
|
end
|
||||||
|
|
||||||
|
def text(start = @ts, stop = @te)
|
||||||
|
return @data[start...stop]
|
||||||
|
end
|
||||||
|
|
||||||
|
def add_token(type, value)
|
||||||
token = [type, value, @line, @column]
|
token = [type, value, @line, @column]
|
||||||
|
|
||||||
advance_column(value.length)
|
advance_column(value.length)
|
||||||
|
@ -108,16 +117,6 @@ module Oga
|
||||||
doctype = smaller whitespace* bang whitespace* 'DOCTYPE'i whitespace*
|
doctype = smaller whitespace* bang whitespace* 'DOCTYPE'i whitespace*
|
||||||
'HTML'i whitespace* any* greater;
|
'HTML'i whitespace* any* greater;
|
||||||
|
|
||||||
# CDATA
|
|
||||||
cdata_start = smaller bang lbracket 'CDATA' lbracket;
|
|
||||||
cdata_end = rbracket rbracket greater;
|
|
||||||
|
|
||||||
main := |*
|
|
||||||
whitespace => { t(:T_SPACE) };
|
|
||||||
newline => { t(:T_NEWLINE); advance_line };
|
|
||||||
|
|
||||||
doctype => { t(:T_DOCTYPE) };
|
|
||||||
|
|
||||||
# CDATA
|
# CDATA
|
||||||
#
|
#
|
||||||
# http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
|
# http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
|
||||||
|
@ -128,25 +127,39 @@ module Oga
|
||||||
# In HTML CDATA tags have no meaning/are not supported. Oga does
|
# In HTML CDATA tags have no meaning/are not supported. Oga does
|
||||||
# support them but treats their contents as plain text.
|
# support them but treats their contents as plain text.
|
||||||
#
|
#
|
||||||
cdata_start
|
cdata_start = smaller bang lbracket 'CDATA' lbracket;
|
||||||
%{
|
cdata_end = rbracket rbracket greater;
|
||||||
@cdata_start = p
|
|
||||||
t(:T_CDATA_START, @ts, p)
|
|
||||||
}
|
|
||||||
|
|
||||||
# Consume everything except ], which is the start of the ending tag.
|
cdata := |*
|
||||||
(any - rbracket)+
|
cdata_start => {
|
||||||
%{
|
t(:T_CDATA_START)
|
||||||
t(:T_TEXT, @cdata_start, p)
|
|
||||||
|
|
||||||
@cdata_start = nil
|
@cdata_buffer = ''
|
||||||
}
|
|
||||||
|
|
||||||
cdata_end
|
|
||||||
>{
|
|
||||||
t(:T_CDATA_END, p, pe)
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
cdata_end => {
|
||||||
|
add_token(:T_TEXT, @cdata_buffer)
|
||||||
|
@cdata_buffer = nil
|
||||||
|
|
||||||
|
t(:T_CDATA_END)
|
||||||
|
|
||||||
|
fgoto main;
|
||||||
|
};
|
||||||
|
|
||||||
|
# Consume everything else character by character and store it in a
|
||||||
|
# separate buffer.
|
||||||
|
any => { @cdata_buffer << text };
|
||||||
|
*|;
|
||||||
|
|
||||||
|
main := |*
|
||||||
|
whitespace => { t(:T_SPACE) };
|
||||||
|
newline => { t(:T_NEWLINE); advance_line };
|
||||||
|
|
||||||
|
doctype => { t(:T_DOCTYPE) };
|
||||||
|
|
||||||
|
# Jump to the cdata machine right away without processing anything.
|
||||||
|
cdata_start >{ fhold; fgoto cdata; };
|
||||||
|
|
||||||
# General rules and actions.
|
# General rules and actions.
|
||||||
smaller => { t(:T_SMALLER) };
|
smaller => { t(:T_SMALLER) };
|
||||||
greater => { t(:T_GREATER) };
|
greater => { t(:T_GREATER) };
|
||||||
|
|
|
@ -17,5 +17,13 @@ describe Oga::Lexer do
|
||||||
[:T_CDATA_END, ']]>', 1, 20]
|
[:T_CDATA_END, ']]>', 1, 20]
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
example 'lex double brackets inside a CDATA tag' do
|
||||||
|
lex('<![CDATA[]]]]>').should == [
|
||||||
|
[:T_CDATA_START, '<![CDATA[', 1, 1],
|
||||||
|
[:T_TEXT, ']]', 1, 10],
|
||||||
|
[:T_CDATA_END, ']]>', 1, 12]
|
||||||
|
]
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue