Tighten up lexing of CSS predicates.

Operators can now only occur inside predicates, and any whitespace in these
predicates is ignored.
This commit is contained in:
Yorick Peterse 2014-10-07 22:17:04 +02:00
parent 625b9eeffd
commit b40c0243ce
4 changed files with 86 additions and 38 deletions

View File

@ -138,16 +138,8 @@ module Oga
hash = '#' %{ add_token(:T_HASH) }; hash = '#' %{ add_token(:T_HASH) };
dot = '.' %{ add_token(:T_DOT) }; dot = '.' %{ add_token(:T_DOT) };
lbrack = '[' %{ add_token(:T_LBRACK) };
rbrack = ']' %{ add_token(:T_RBRACK) };
colon = ':' %{ add_token(:T_COLON) }; colon = ':' %{ add_token(:T_COLON) };
lparen = '(';
rparen = ')';
pipe = '|'; pipe = '|';
odd = 'odd';
even = 'even';
minus = '-';
nth = 'n';
comma = whitespace* ',' whitespace*; comma = whitespace* ',' whitespace*;
action emit_pipe { action emit_pipe {
@ -221,6 +213,13 @@ module Oga
# #
# http://www.w3.org/TR/css3-selectors/#structural-pseudos # http://www.w3.org/TR/css3-selectors/#structural-pseudos
lparen = '(';
rparen = ')';
odd = 'odd';
even = 'even';
minus = '-';
nth = 'n';
action emit_lparen { action emit_lparen {
add_token(:T_LPAREN) add_token(:T_LPAREN)
@ -248,8 +247,29 @@ module Oga
rparen => emit_rparen; rparen => emit_rparen;
*|; *|;
main := |* # Predicates
hash | dot | lbrack | rbrack | colon; #
# CSS predicates can be used to filter nodes based on the value of an
# attribute.
lbrack = '[';
rbrack = ']';
action emit_lbrack {
add_token(:T_LBRACK)
fnext predicate;
}
action emit_rbrack {
add_token(:T_RBRACK)
fnext main;
}
# Machine used for lexing the body of a CSS predicate.
predicate := |*
whitespace;
# Some of the operators have similar characters (e.g. the "="). As a # Some of the operators have similar characters (e.g. the "="). As a
# result we can't use rules like the following: # result we can't use rules like the following:
@ -259,26 +279,30 @@ module Oga
# #
# This would result in both machines being executed for the input # This would result in both machines being executed for the input
# "*=". The syntax below ensures that only the first match is handled. # "*=". The syntax below ensures that only the first match is handled.
op_eq => { add_token(:T_EQ) }; op_eq => { add_token(:T_EQ) };
op_space_in => { add_token(:T_SPACE_IN) }; op_space_in => { add_token(:T_SPACE_IN) };
op_starts_with => { add_token(:T_STARTS_WITH) }; op_starts_with => { add_token(:T_STARTS_WITH) };
op_ends_with => { add_token(:T_ENDS_WITH) }; op_ends_with => { add_token(:T_ENDS_WITH) };
op_in => { add_token(:T_IN) }; op_in => { add_token(:T_IN) };
op_hyphen_in => { add_token(:T_HYPHEN_IN) }; op_hyphen_in => { add_token(:T_HYPHEN_IN) };
identifier => emit_identifier;
rbrack => emit_rbrack;
string => emit_string;
*|;
main := |*
hash | dot | colon;
op_child => { add_token(:T_CHILD) }; op_child => { add_token(:T_CHILD) };
op_fol_direct => { add_token(:T_FOLLOWING_DIRECT) }; op_fol_direct => { add_token(:T_FOLLOWING_DIRECT) };
op_fol => { add_token(:T_FOLLOWING) }; op_fol => { add_token(:T_FOLLOWING) };
# The pipe character is also used in the |= operator so the action for lbrack => emit_lbrack;
# this is handled separately.
pipe => emit_pipe; pipe => emit_pipe;
comma => emit_comma; comma => emit_comma;
whitespace => emit_whitespace; whitespace => emit_whitespace;
lparen => emit_lparen; lparen => emit_lparen;
identifier => emit_identifier; identifier => emit_identifier;
integer => emit_integer;
string => emit_string;
any; any;
*|; *|;

View File

@ -1,9 +0,0 @@
require 'spec_helper'
describe Oga::CSS::Lexer do
context 'integers' do
example 'lex an integer' do
lex_css('10').should == [[:T_INT, 10]]
end
end
end

View File

@ -3,35 +3,60 @@ require 'spec_helper'
describe Oga::CSS::Lexer do describe Oga::CSS::Lexer do
context 'operators' do context 'operators' do
example 'lex the = operator' do example 'lex the = operator' do
lex_css('=').should == [[:T_EQ, nil]] lex_css('[=]').should == [
[:T_LBRACK, nil],
[:T_EQ, nil],
[:T_RBRACK, nil]
]
end end
example 'lex the ~= operator' do example 'lex the ~= operator' do
lex_css('~=').should == [[:T_SPACE_IN, nil]] lex_css('[~=]').should == [
[:T_LBRACK, nil],
[:T_SPACE_IN, nil],
[:T_RBRACK, nil]
]
end end
example 'lex the ^= operator' do example 'lex the ^= operator' do
lex_css('^=').should == [[:T_STARTS_WITH, nil]] lex_css('[^=]').should == [
[:T_LBRACK, nil],
[:T_STARTS_WITH, nil],
[:T_RBRACK, nil]
]
end end
example 'lex the $= operator' do example 'lex the $= operator' do
lex_css('$=').should == [[:T_ENDS_WITH, nil]] lex_css('[$=]').should == [
[:T_LBRACK, nil],
[:T_ENDS_WITH, nil],
[:T_RBRACK, nil],
]
end end
example 'lex the *= operator' do example 'lex the *= operator' do
lex_css('*=').should == [[:T_IN, nil]] lex_css('[*=]').should == [
[:T_LBRACK, nil],
[:T_IN, nil],
[:T_RBRACK, nil]
]
end end
example 'lex an identifier followed by the *= operator' do example 'lex an identifier followed by the *= operator' do
lex_css('foo *=').should == [ lex_css('[foo *=]').should == [
[:T_LBRACK, nil],
[:T_IDENT, 'foo'], [:T_IDENT, 'foo'],
[:T_SPACE, nil], [:T_IN, nil],
[:T_IN, nil] [:T_RBRACK, nil]
] ]
end end
example 'lex the |= operator' do example 'lex the |= operator' do
lex_css('|=').should == [[:T_HYPHEN_IN, nil]] lex_css('[|=]').should == [
[:T_LBRACK, nil],
[:T_HYPHEN_IN, nil],
[:T_RBRACK, nil]
]
end end
end end
end end

View File

@ -3,11 +3,19 @@ require 'spec_helper'
describe Oga::CSS::Lexer do describe Oga::CSS::Lexer do
context 'strings' do context 'strings' do
example 'lex a single quoted string' do example 'lex a single quoted string' do
lex_css("'foo'").should == [[:T_STRING, 'foo']] lex_css("['foo']").should == [
[:T_LBRACK, nil],
[:T_STRING, 'foo'],
[:T_RBRACK, nil]
]
end end
example 'lex a double quoted string' do example 'lex a double quoted string' do
lex_css('"foo"').should == [[:T_STRING, 'foo']] lex_css('["foo"]').should == [
[:T_LBRACK, nil],
[:T_STRING, 'foo'],
[:T_RBRACK, nil]
]
end end
end end
end end