From b40c0243ceb7c9bd4f7832bf28d9da72ba1591a9 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Tue, 7 Oct 2014 22:17:04 +0200 Subject: [PATCH] Tighten up lexing of CSS predicates. Operators can now only occur inside predicates and any whitespace in these predicates is ignored. --- lib/oga/css/lexer.rl | 60 +++++++++++++++++++--------- spec/oga/css/lexer/integers_spec.rb | 9 ----- spec/oga/css/lexer/operators_spec.rb | 43 +++++++++++++++----- spec/oga/css/lexer/strings_spec.rb | 12 +++++- 4 files changed, 86 insertions(+), 38 deletions(-) delete mode 100644 spec/oga/css/lexer/integers_spec.rb diff --git a/lib/oga/css/lexer.rl b/lib/oga/css/lexer.rl index 79b19c7..24d8512 100644 --- a/lib/oga/css/lexer.rl +++ b/lib/oga/css/lexer.rl @@ -138,16 +138,8 @@ module Oga hash = '#' %{ add_token(:T_HASH) }; dot = '.' %{ add_token(:T_DOT) }; - lbrack = '[' %{ add_token(:T_LBRACK) }; - rbrack = ']' %{ add_token(:T_RBRACK) }; colon = ':' %{ add_token(:T_COLON) }; - lparen = '('; - rparen = ')'; pipe = '|'; - odd = 'odd'; - even = 'even'; - minus = '-'; - nth = 'n'; comma = whitespace* ',' whitespace*; action emit_pipe { @@ -221,6 +213,13 @@ module Oga # # http://www.w3.org/TR/css3-selectors/#structural-pseudos + lparen = '('; + rparen = ')'; + odd = 'odd'; + even = 'even'; + minus = '-'; + nth = 'n'; + action emit_lparen { add_token(:T_LPAREN) @@ -248,8 +247,29 @@ module Oga rparen => emit_rparen; *|; - main := |* - hash | dot | lbrack | rbrack | colon; + # Predicates + # + # CSS predicates can be used to filter nodes based on the value of an + # attribute. + + lbrack = '['; + rbrack = ']'; + + action emit_lbrack { + add_token(:T_LBRACK) + + fnext predicate; + } + + action emit_rbrack { + add_token(:T_RBRACK) + + fnext main; + } + + # Machine used for lexing the body of a CSS predicate. + predicate := |* + whitespace; # Some of the operators have similar characters (e.g. the "=").
As a # result we can't use rules like the following: @@ -259,26 +279,30 @@ module Oga # # This would result in both machines being executed for the input # "*=". The syntax below ensures that only the first match is handled. - op_eq => { add_token(:T_EQ) }; op_space_in => { add_token(:T_SPACE_IN) }; op_starts_with => { add_token(:T_STARTS_WITH) }; op_ends_with => { add_token(:T_ENDS_WITH) }; op_in => { add_token(:T_IN) }; op_hyphen_in => { add_token(:T_HYPHEN_IN) }; - op_child => { add_token(:T_CHILD) }; - op_fol_direct => { add_token(:T_FOLLOWING_DIRECT) }; - op_fol => { add_token(:T_FOLLOWING) }; + identifier => emit_identifier; + rbrack => emit_rbrack; + string => emit_string; + *|; - # The pipe character is also used in the |= operator so the action for - # this is handled separately. + main := |* + hash | dot | colon; + + op_child => { add_token(:T_CHILD) }; + op_fol_direct => { add_token(:T_FOLLOWING_DIRECT) }; + op_fol => { add_token(:T_FOLLOWING) }; + + lbrack => emit_lbrack; pipe => emit_pipe; comma => emit_comma; whitespace => emit_whitespace; lparen => emit_lparen; identifier => emit_identifier; - integer => emit_integer; - string => emit_string; any; *|; diff --git a/spec/oga/css/lexer/integers_spec.rb b/spec/oga/css/lexer/integers_spec.rb deleted file mode 100644 index 95e549d..0000000 --- a/spec/oga/css/lexer/integers_spec.rb +++ /dev/null @@ -1,9 +0,0 @@ -require 'spec_helper' - -describe Oga::CSS::Lexer do - context 'integers' do - example 'lex an integer' do - lex_css('10').should == [[:T_INT, 10]] - end - end -end diff --git a/spec/oga/css/lexer/operators_spec.rb b/spec/oga/css/lexer/operators_spec.rb index 1bc053e..29ee2c3 100644 --- a/spec/oga/css/lexer/operators_spec.rb +++ b/spec/oga/css/lexer/operators_spec.rb @@ -3,35 +3,60 @@ require 'spec_helper' describe Oga::CSS::Lexer do context 'operators' do example 'lex the = operator' do - lex_css('=').should == [[:T_EQ, nil]] + lex_css('[=]').should == [ + [:T_LBRACK, nil], + [:T_EQ, nil], + 
[:T_RBRACK, nil] + ] end example 'lex the ~= operator' do - lex_css('~=').should == [[:T_SPACE_IN, nil]] + lex_css('[~=]').should == [ + [:T_LBRACK, nil], + [:T_SPACE_IN, nil], + [:T_RBRACK, nil] + ] end example 'lex the ^= operator' do - lex_css('^=').should == [[:T_STARTS_WITH, nil]] + lex_css('[^=]').should == [ + [:T_LBRACK, nil], + [:T_STARTS_WITH, nil], + [:T_RBRACK, nil] + ] end example 'lex the $= operator' do - lex_css('$=').should == [[:T_ENDS_WITH, nil]] + lex_css('[$=]').should == [ + [:T_LBRACK, nil], + [:T_ENDS_WITH, nil], + [:T_RBRACK, nil], + ] end example 'lex the *= operator' do - lex_css('*=').should == [[:T_IN, nil]] + lex_css('[*=]').should == [ + [:T_LBRACK, nil], + [:T_IN, nil], + [:T_RBRACK, nil] + ] end example 'lex an identifier followed by the *= operator' do - lex_css('foo *=').should == [ + lex_css('[foo *=]').should == [ + [:T_LBRACK, nil], [:T_IDENT, 'foo'], - [:T_SPACE, nil], - [:T_IN, nil] + [:T_IN, nil], + [:T_RBRACK, nil] ] end example 'lex the |= operator' do - lex_css('|=').should == [[:T_HYPHEN_IN, nil]] + lex_css('[|=]').should == [ + [:T_LBRACK, nil], + [:T_HYPHEN_IN, nil], + [:T_RBRACK, nil] + ] end end end diff --git a/spec/oga/css/lexer/strings_spec.rb b/spec/oga/css/lexer/strings_spec.rb index 7eb0f3f..20bd62c 100644 --- a/spec/oga/css/lexer/strings_spec.rb +++ b/spec/oga/css/lexer/strings_spec.rb @@ -3,11 +3,19 @@ require 'spec_helper' describe Oga::CSS::Lexer do context 'strings' do example 'lex a single quoted string' do - lex_css("'foo'").should == [[:T_STRING, 'foo']] + lex_css("['foo']").should == [ + [:T_LBRACK, nil], + [:T_STRING, 'foo'], + [:T_RBRACK, nil] + ] end example 'lex a double quoted string' do - lex_css('"foo"').should == [[:T_STRING, 'foo']] + lex_css('["foo"]').should == [ + [:T_LBRACK, nil], + [:T_STRING, 'foo'], + [:T_RBRACK, nil] + ] end end end