From aa60115c0a5711bc323d8d16f3214f2acc342f96 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Tue, 16 Sep 2014 16:32:57 +0200 Subject: [PATCH] Basic boilerplate for lexing CSS selectors. --- .gitignore | 1 + lib/oga.rb | 2 + lib/oga/css/lexer.rl | 159 +++++++++++++++++++++++++++++++ spec/oga/css/lexer/paths_spec.rb | 17 ++++ spec/support/parsing.rb | 10 ++ task/lexer.rake | 3 +- 6 files changed, 191 insertions(+), 1 deletion(-) create mode 100644 lib/oga/css/lexer.rl create mode 100644 spec/oga/css/lexer/paths_spec.rb diff --git a/.gitignore b/.gitignore index 97d57e8..d14b3c1 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ Gemfile.lock lib/oga/xml/parser.rb lib/oga/xpath/lexer.rb lib/oga/xpath/parser.rb +lib/oga/css/lexer.rb lib/liboga.* diff --git a/lib/oga.rb b/lib/oga.rb index e0804f3..382561a 100644 --- a/lib/oga.rb +++ b/lib/oga.rb @@ -48,3 +48,5 @@ require_relative 'oga/xpath/node' require_relative 'oga/xpath/lexer' require_relative 'oga/xpath/parser' require_relative 'oga/xpath/evaluator' + +require_relative 'oga/css/lexer' diff --git a/lib/oga/css/lexer.rl b/lib/oga/css/lexer.rl new file mode 100644 index 0000000..b9fb553 --- /dev/null +++ b/lib/oga/css/lexer.rl @@ -0,0 +1,159 @@ +%%machine css_lexer; # % + +module Oga + module CSS + ## + # Lexer for turning CSS expressions into a sequence of tokens. Tokens are + # returned as arrays with every array having two values: + # + # 1. The token type as a Symbol + # 2. The token value, or nil if there is no value. + # + # ## Thread Safety + # + # Similar to the XPath lexer this lexer keeps track of an internal state. As + # a result it's not safe to share the same instance of this lexer between + # multiple threads. However, no global state is used so you can use separate + # instances in threads just fine. + # + class Lexer + %% write data; + + # % fix highlight + + ## + # @param [String] data The data to lex. 
+ # + def initialize(data) + @data = data + end + + ## + # Gathers all the tokens for the input and returns them as an Array. + # + # @see [#advance] + # @return [Array] + # + def lex + tokens = [] + + advance do |type, value| + tokens << [type, value] + end + + return tokens + end + + ## + # Advances through the input and generates the corresponding tokens. Each + # token is yielded to the supplied block. + # + # This method stores the supplied block in `@block` and resets it after + # the lexer loop has finished. + # + # @see [#add_token] + # + def advance(&block) + @block = block + + data = @data # saves ivar lookups while lexing. + ts = nil + te = nil + stack = [] + top = 0 + cs = self.class.css_lexer_start + act = 0 + eof = @data.bytesize + p = 0 + pe = eof + + _css_lexer_eof_trans = self.class.send(:_css_lexer_eof_trans) + _css_lexer_from_state_actions = self.class.send(:_css_lexer_from_state_actions) + _css_lexer_index_offsets = self.class.send(:_css_lexer_index_offsets) + _css_lexer_indicies = self.class.send(:_css_lexer_indicies) + _css_lexer_key_spans = self.class.send(:_css_lexer_key_spans) + _css_lexer_to_state_actions = self.class.send(:_css_lexer_to_state_actions) + _css_lexer_trans_actions = self.class.send(:_css_lexer_trans_actions) + _css_lexer_trans_keys = self.class.send(:_css_lexer_trans_keys) + _css_lexer_trans_targs = self.class.send(:_css_lexer_trans_targs) + + %% write exec; + + # % fix highlight + ensure + @block = nil + end + + private + + ## + # Emits a token of which the value is based on the supplied start/stop + # position. + # + # @param [Symbol] type The token type. + # @param [Fixnum] start + # @param [Fixnum] stop + # + # @see [#slice_input] + # @see [#add_token] + # + def emit(type, start, stop) + value = slice_input(start, stop) + + add_token(type, value) + end + + ## + # Returns the text between the specified start and stop position.
+ # + # @param [Fixnum] start + # @param [Fixnum] stop + # @return [String] + # + def slice_input(start, stop) + return @data.byteslice(start, stop - start) + end + + ## + # Yields a new token to the supplied block. + # + # @param [Symbol] type The token type. + # @param [String] value The token value. + # + # @yieldparam [Symbol] type + # @yieldparam [String|NilClass] value + # + def add_token(type, value = nil) + @block.call(type, value) + end + + %%{ + getkey (data.getbyte(p) || 0); + + whitespace = [\t ]+; + + action emit_space { + add_token(:T_SPACE) + } + + # Identifiers + # + # Identifiers are used for element and attribute names. Identifiers have + # to start with a letter (the pattern below additionally accepts `*`). + + identifier = [a-zA-Z*]+ [a-zA-Z\-_0-9]*; + + action emit_identifier { + emit(:T_IDENT, ts, te) + } + + main := |* + identifier => emit_identifier; + whitespace => emit_space; + + any; + *|; + }%% + end # Lexer + end # CSS +end # Oga diff --git a/spec/oga/css/lexer/paths_spec.rb b/spec/oga/css/lexer/paths_spec.rb new file mode 100644 index 0000000..ad71feb --- /dev/null +++ b/spec/oga/css/lexer/paths_spec.rb @@ -0,0 +1,17 @@ +require 'spec_helper' + +describe Oga::CSS::Lexer do + context 'paths' do + example 'lex a simple path' do + lex_css('h3').should == [[:T_IDENT, 'h3']] + end + + example 'lex a path with two members' do + lex_css('div h3').should == [ + [:T_IDENT, 'div'], + [:T_SPACE, nil], + [:T_IDENT, 'h3'] + ] + end + end +end diff --git a/spec/support/parsing.rb b/spec/support/parsing.rb index c17e77c..ec59a3f 100644 --- a/spec/support/parsing.rb +++ b/spec/support/parsing.rb @@ -33,6 +33,16 @@ module Oga return Oga::XPath::Lexer.new(input).lex end + ## + # Lexes a CSS expression. + # + # @param [String] input + # @return [Array] + # + def lex_css(input) + return Oga::CSS::Lexer.new(input).lex + end + ## # Parses an XPath expression.
# diff --git a/task/lexer.rake b/task/lexer.rake index e4bc98f..b6b37e2 100644 --- a/task/lexer.rake +++ b/task/lexer.rake @@ -28,5 +28,6 @@ desc 'Generates the lexers' multitask :lexer => [ 'ext/c/lexer.c', 'ext/java/org/liboga/xml/Lexer.java', - 'lib/oga/xpath/lexer.rb' + 'lib/oga/xpath/lexer.rb', + 'lib/oga/css/lexer.rb' ]