From c4e0406ed9747fc3d2cfd56eee93cdf92059ef20 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Wed, 26 Feb 2014 22:01:07 +0100 Subject: [PATCH] Lexing of CDATA tags. --- lib/oga/lexer.rl | 21 +++++++++++++-------- spec/oga/lexer_spec.rb | 16 ++++++++++++++++ 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/lib/oga/lexer.rl b/lib/oga/lexer.rl index a24bbcd..0518f9e 100644 --- a/lib/oga/lexer.rl +++ b/lib/oga/lexer.rl @@ -73,20 +73,23 @@ module Oga any_escaped = /\\./; - smaller = '<'; - greater = '>'; - slash = '/'; - bang = '!'; - equals = '='; - colon = ':'; - dash = '-'; + smaller = '<'; + greater = '>'; + slash = '/'; + bang = '!'; + equals = '='; + colon = ':'; + dash = '-'; + lbracket = '['; + rbracket = ']'; s_quote = "'"; d_quote = '"'; # FIXME: there really should be a better way of doing this. text = (any - s_quote - d_quote - equals - bang - slash - - greater - smaller - whitespace - newline - colon - dash)+; + greater - smaller - whitespace - newline - colon - dash - + lbracket - rbracket)+; # Unicode characters, taken from whitequark's wonderful parser library. # (I honestly need to buy that dude a beer or 100). Basically this @@ -103,6 +106,8 @@ module Oga d_quote => { t(:T_DQUOTE) }; s_quote => { t(:T_SQUOTE) }; dash => { t(:T_DASH) }; + rbracket => { t(:T_RBRACKET) }; + lbracket => { t(:T_LBRACKET) }; colon => { t(:T_COLON) }; bang => { t(:T_BANG) }; equals => { t(:T_EQUALS) }; diff --git a/spec/oga/lexer_spec.rb b/spec/oga/lexer_spec.rb index 40e0548..ca99113 100644 --- a/spec/oga/lexer_spec.rb +++ b/spec/oga/lexer_spec.rb @@ -116,4 +116,20 @@ describe Oga::Lexer do ] end end + + context 'cdata tags' do + example 'lex a cdata tag' do + lex('<![CDATA[foo]]>').should == [ + [:T_SMALLER, '<', 1, 1], + [:T_BANG, '!', 1, 2], + [:T_LBRACKET, '[', 1, 3], + [:T_TEXT, 'CDATA', 1, 4], + [:T_LBRACKET, '[', 1, 9], + [:T_TEXT, 'foo', 1, 10], + [:T_RBRACKET, ']', 1, 13], + [:T_RBRACKET, ']', 1, 14], + [:T_GREATER, '>', 1, 15], + ] + end + end end