diff --git a/.editorconfig b/.editorconfig index eac453c..ca5aea5 100644 --- a/.editorconfig +++ b/.editorconfig @@ -7,3 +7,6 @@ trim_trailing_whitespace = true [*.{y,rb,rl}] indent_size = 2 + +[*.{h,h},ext/oga/xml/*.rl] +indent_size = 2 diff --git a/.gitignore b/.gitignore index a165c71..e7440d6 100644 --- a/.gitignore +++ b/.gitignore @@ -3,10 +3,14 @@ coverage pkg Gemfile.lock -lib/oga/xml/lexer.rb lib/oga/xml/parser.rb +lib/liboga.* benchmark/fixtures/big.xml profile/samples/*.txt profile/samples/*/*.txt +*.so +tmp + +ext/liboga/lexer.c diff --git a/MANIFEST b/MANIFEST index c862606..5b2c86a 100644 --- a/MANIFEST +++ b/MANIFEST @@ -5,6 +5,14 @@ README.md doc/DCO.md doc/changelog.md doc/css/common.css +ext/liboga/extconf.rb +ext/liboga/lexer.c +ext/liboga/lexer.h +ext/liboga/lexer.rl +ext/liboga/liboga.c +ext/liboga/liboga.h +ext/liboga/xml.c +ext/liboga/xml.h lib/oga.rb lib/oga/html/parser.rb lib/oga/version.rb @@ -14,10 +22,8 @@ lib/oga/xml/doctype.rb lib/oga/xml/document.rb lib/oga/xml/element.rb lib/oga/xml/lexer.rb -lib/oga/xml/lexer.rl lib/oga/xml/node.rb lib/oga/xml/parser.rb -lib/oga/xml/parser.y lib/oga/xml/pull_parser.rb lib/oga/xml/text.rb lib/oga/xml/xml_declaration.rb diff --git a/Rakefile b/Rakefile index da33ccb..a25c691 100644 --- a/Rakefile +++ b/Rakefile @@ -1,33 +1,38 @@ require 'bundler/gem_tasks' require 'digest/sha2' require 'rake/clean' +require 'rake/extensiontask' require 'cliver' GEMSPEC = Gem::Specification.load('oga.gemspec') -LEXER_OUTPUT = 'lib/oga/xml/lexer.rb' PARSER_OUTPUT = 'lib/oga/xml/parser.rb' CLEAN.include( 'coverage', 'yardoc', - LEXER_OUTPUT, PARSER_OUTPUT, 'benchmark/fixtures/big.xml', - 'profile/samples/**/*.txt' + 'profile/samples/**/*.txt', + 'lib/liboga.*', + 'tmp', + 'ext/liboga/lexer.c' ) FILE_LIST = FileList.new( 'checkum/**/*.*', 'doc/**/*.*', - 'lib/**/*.*', + 'lib/**/*.rb', 'LICENSE', 'MANIFEST', '*.gemspec', 'README.md', - '.yardopts' + '.yardopts', + 'ext/**/*.*' ) +Rake::ExtensionTask.new('liboga', GEMSPEC) 
+ Dir['./task/*.rake'].each do |task| import(task) end diff --git a/ext/liboga/extconf.rb b/ext/liboga/extconf.rb new file mode 100644 index 0000000..469c1a8 --- /dev/null +++ b/ext/liboga/extconf.rb @@ -0,0 +1,13 @@ +require 'mkmf' + +have_header('ruby.h') + +$CFLAGS << ' -Wextra -Wall -pedantic' + +if ENV['DEBUG'] + $CFLAGS << ' -O0' +else + $CFLAGS << ' -O3 -g' +end + +create_makefile('liboga/liboga') diff --git a/ext/liboga/lexer.h b/ext/liboga/lexer.h new file mode 100644 index 0000000..9da1c67 --- /dev/null +++ b/ext/liboga/lexer.h @@ -0,0 +1,10 @@ +#include "liboga.h" + +#ifndef LIBOGA_XML_LEXER_H +#define LIBOGA_XML_LEXER_H + +extern VALUE oga_cLexer; + +extern void Init_liboga_xml_lexer(); + +#endif diff --git a/ext/liboga/lexer.rl b/ext/liboga/lexer.rl new file mode 100644 index 0000000..01b2f82 --- /dev/null +++ b/ext/liboga/lexer.rl @@ -0,0 +1,298 @@ +#include "lexer.h" + +VALUE oga_cLexer; + +%%machine lexer; + +void oga_xml_lexer_callback( + VALUE self, + const char *name, + rb_encoding *encoding, + const char *ts, + const char *te +) +{ + long length = te - ts; /* size of the token in bytes */ + VALUE value = rb_enc_str_new(ts, length, encoding); /* copies the bytes itself, so no strndup() buffer is leaked and NUL bytes are kept */ + ID method = rb_intern(name); /* rb_intern() returns an ID, not a VALUE */ + + rb_funcall(self, method, 1, value); +} + +void oga_xml_lexer_callback_simple(VALUE self, const char *name) +{ + ID method = rb_intern(name); /* rb_intern() returns an ID, not a VALUE */ + + rb_funcall(self, method, 0); +} + +%% write data; + +VALUE oga_xml_lexer_advance(VALUE self) +{ + /* Pull the data in from Ruby land. */ + VALUE data_ivar = rb_ivar_get(self, rb_intern("@data")); + + /* Make sure that all data passed back to Ruby has the proper encoding. 
*/ + rb_encoding *encoding = rb_enc_get(data_ivar); + + char *data_str_val = StringValuePtr(data_ivar); + + const char *p = data_str_val; + const char *pe = data_str_val + RSTRING_LEN(data_ivar); /* byte length of the String; strlen() would stop at the first NUL byte */ + const char *eof = pe; + const char *ts, *te; + + int act = 0; + int cs = 0; + int top = 0; + int stack[8]; + + %% write init; + %% write exec; + + return Qnil; +} + +%%{ + newline = '\n' | '\r\n'; + whitespace = [ \t]; + identifier = [a-zA-Z0-9\-_:]+; + + # Strings + # + # Strings in HTML can either be single or double quoted. If a string + # starts with one of these quotes it must be closed with the same type + # of quote. + dquote = '"'; + squote = "'"; + + # Machine for processing double quoted strings. + string_dquote := |* + ^dquote+ => { + oga_xml_lexer_callback(self, "on_string", encoding, ts, te); + }; + + dquote => { fret; }; + *|; + + # Machine for processing single quoted strings. + string_squote := |* + ^squote+ => { + oga_xml_lexer_callback(self, "on_string", encoding, ts, te); + }; + + squote => { fret; }; + *|; + + # DOCTYPES + # + # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax + # + # These rules support the 3 flavours of doctypes: + # + # 1. Normal doctypes, as introduced in the HTML5 specification. + # 2. Deprecated doctypes, the more verbose ones used prior to HTML5. + # 3. Legacy doctypes + # + doctype_start = '<!DOCTYPE'i whitespace+; + + action start_doctype { + oga_xml_lexer_callback_simple(self, "on_start_doctype"); + fcall doctype; + } + + doctype := |* + 'PUBLIC' | 'SYSTEM' => { + oga_xml_lexer_callback(self, "on_doctype_type", encoding, ts, te); + }; + + # Lex the public/system IDs as regular strings. + dquote => { fcall string_dquote; }; + squote => { fcall string_squote; }; + + # Whitespace inside doctypes is ignored since there's no point in + # including it. 
+ whitespace; + + identifier => { + oga_xml_lexer_callback(self, "on_doctype_name", encoding, ts, te); + }; + + '>' => { + oga_xml_lexer_callback_simple(self, "on_doctype_end"); + fret; + }; + *|; + + # CDATA + # + # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections + # + # CDATA tags are broken up into 3 parts: the start, the content and the + # end tag. + # + # In HTML CDATA tags have no meaning/are not supported. Oga does + # support them but treats their contents as plain text. + # + cdata_start = '<![CDATA['; + cdata_end = ']]>'; + + action start_cdata { + oga_xml_lexer_callback_simple(self, "on_cdata_start"); + fcall cdata; + } + + # Machine for processing the contents of CDATA tags. Everything + # inside a CDATA tag is treated as plain text. + cdata := |* + any* cdata_end => { + oga_xml_lexer_callback(self, "on_text", encoding, ts, te - 3); /* te - 3 drops the 3-byte "]]>" terminator */ + oga_xml_lexer_callback_simple(self, "on_cdata_end"); + fret; + }; + *|; + + # Comments + # + # http://www.w3.org/TR/html-markup/syntax.html#comments + # + # Comments are lexed into 3 parts: the start tag, the content and the + # end tag. + # + # Unlike the W3 specification these rules *do* allow character + # sequences such as `--` and `->`. Putting extra checks in for these + # sequences would actually make the rules/actions more complex. + # + comment_start = '<!--'; + comment_end = '-->'; + + action start_comment { + oga_xml_lexer_callback_simple(self, "on_comment_start"); + fcall comment; + } + + # Machine used for processing the contents of a comment. Everything + # inside a comment is treated as plain text (similar to CDATA tags). 
+ comment := |* + any* comment_end => { + oga_xml_lexer_callback(self, "on_text", encoding, ts, te - 3); /* te - 3 drops the 3-byte "-->" terminator */ + oga_xml_lexer_callback_simple(self, "on_comment_end"); + fret; + }; + *|; + + # XML declaration tags + # + # http://www.w3.org/TR/REC-xml/#sec-prolog-dtd + # + xml_decl_start = '<?xml'; + xml_decl_end = '?>'; + + action start_xml_decl { + oga_xml_lexer_callback_simple(self, "on_xml_decl_start"); + fcall xml_decl; + } + + # Machine that processes the contents of an XML declaration tag. + xml_decl := |* + xml_decl_end => { + oga_xml_lexer_callback_simple(self, "on_xml_decl_end"); + fret; + }; + + # Attributes and their values (e.g. version="1.0"). + identifier => { + oga_xml_lexer_callback(self, "on_attribute", encoding, ts, te); + }; + + dquote => { fcall string_dquote; }; + squote => { fcall string_squote; }; + + any; + *|; + + # Elements + # + # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements + # + + # Action that creates the tokens for the opening tag, name and + # namespace (if any). Remaining work is delegated to a dedicated + # machine. + action start_element { + oga_xml_lexer_callback(self, "on_element_start", encoding, ts + 1, te); /* ts + 1 skips the leading '<' */ + + fcall element_head; + } + + element_start = '<' identifier; + + # Machine used for processing the characters inside an element head. An + # element head is everything between `<NAME` and `>`. + # + # For example, in `<p foo="bar">` the element head is ` foo="bar"`. 
+ # + element_head := |* + whitespace | '='; + + newline => { + oga_xml_lexer_callback_simple(self, "on_newline"); + }; + + # Attribute names. + identifier => { + oga_xml_lexer_callback(self, "on_attribute", encoding, ts, te); + }; + + # Attribute values. + dquote => { fcall string_dquote; }; + squote => { fcall string_squote; }; + + # The closing character of the open tag. + ('>' | '/') => { + fhold; + fret; + }; + *|; + + main := |* + element_start => start_element; + doctype_start => start_doctype; + cdata_start => start_cdata; + comment_start => start_comment; + xml_decl_start => start_xml_decl; + + # Enter the body of the tag. If HTML mode is enabled and the current + # element is a void element we'll close it and bail out. + '>' => { + oga_xml_lexer_callback_simple(self, "on_element_open_end"); + }; + + # Regular closing tags. + '</' identifier '>' => { + oga_xml_lexer_callback_simple(self, "on_element_end"); + }; + + # Self closing elements that are not handled by the HTML mode. + '/>' => { + oga_xml_lexer_callback_simple(self, "on_element_end"); + }; + + # Note that this rule should be declared at the very bottom as it + # will otherwise take precedence over the other rules. 
+ ^('<' | '>')+ => { + oga_xml_lexer_callback(self, "on_text", encoding, ts, te); + }; + *|; +}%% + +void Init_liboga_xml_lexer() +{ + oga_cLexer = rb_define_class_under(oga_mXML, "Lexer", rb_cObject); + + rb_define_method(oga_cLexer, "advance_native", oga_xml_lexer_advance, 0); +} diff --git a/ext/liboga/liboga.c b/ext/liboga/liboga.c new file mode 100644 index 0000000..c49d6a1 --- /dev/null +++ b/ext/liboga/liboga.c @@ -0,0 +1,11 @@ +#include "liboga.h" + +VALUE oga_mOga; + +void Init_liboga() +{ + oga_mOga = rb_define_module("Oga"); + + Init_liboga_xml(); + Init_liboga_xml_lexer(); +} diff --git a/ext/liboga/liboga.h b/ext/liboga/liboga.h new file mode 100644 index 0000000..6f3c0af --- /dev/null +++ b/ext/liboga/liboga.h @@ -0,0 +1,17 @@ +#ifndef LIBOGA_H +#define LIBOGA_H + +#include +#include +#include +#include +#include + +extern VALUE oga_mOga; + +#include "xml.h" +#include "lexer.h" + +void Init_liboga(); + +#endif diff --git a/ext/liboga/xml.c b/ext/liboga/xml.c new file mode 100644 index 0000000..63ef162 --- /dev/null +++ b/ext/liboga/xml.c @@ -0,0 +1,8 @@ +#include "xml.h" + +VALUE oga_mXML; + +void Init_liboga_xml() +{ + oga_mXML = rb_define_module_under(oga_mOga, "XML"); +} diff --git a/ext/liboga/xml.h b/ext/liboga/xml.h new file mode 100644 index 0000000..672a579 --- /dev/null +++ b/ext/liboga/xml.h @@ -0,0 +1,10 @@ +#ifndef LIBOGA_XML_H +#define LIBOGA_XML_H + +#include "liboga.h" + +extern VALUE oga_mXML; + +void Init_liboga_xml(); + +#endif diff --git a/lib/oga.rb b/lib/oga.rb index 19ca643..1a74658 100644 --- a/lib/oga.rb +++ b/lib/oga.rb @@ -1,5 +1,7 @@ require 'set' +require_relative 'liboga' + require_relative 'oga/xml/lexer' require_relative 'oga/xml/parser' require_relative 'oga/xml/pull_parser' diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb new file mode 100644 index 0000000..f5dd157 --- /dev/null +++ b/lib/oga/xml/lexer.rb @@ -0,0 +1,249 @@ +module Oga + module XML + ## + # Low level lexer that supports both XML and HTML (using 
an extra option). + # To lex HTML input set the `:html` option to `true` when creating an + # instance of the lexer: + # + # lexer = Oga::XML::Lexer.new(data, :html => true) + # + # @!attribute [r] html + # @return [TrueClass|FalseClass] + # + # @!attribute [r] tokens + # @return [Array] + # + class Lexer + attr_reader :html + + ## + # Names of the HTML void elements that should be handled when HTML lexing + # is enabled. + # + # @return [Set] + # + HTML_VOID_ELEMENTS = Set.new([ + 'area', + 'base', + 'br', + 'col', + 'command', + 'embed', + 'hr', + 'img', + 'input', + 'keygen', + 'link', + 'meta', + 'param', + 'source', + 'track', + 'wbr' + ]) + + ## + # @param [String] data The data to lex. + # + # @param [Hash] options + # + # @option options [TrueClass|FalseClass] :html When set to `true` the lexer will treat + # the input as HTML instead of SGML/XML. This makes it possible to lex + # HTML void elements such as `<br>`. + # + def initialize(data, options = {}) + options.each do |key, value| + instance_variable_set("@#{key}", value) if respond_to?(key) + end + + @data = data + + reset + end + + ## + # Resets the internal state of the lexer. Typically you don't need to + # call this method yourself as it's called by #lex after lexing a given + # String. + # + def reset + @line = 1 + @elements = [] + end + + ## + # Gathers all the tokens for the input and returns them as an Array. + # + # This method resets the internal state of the lexer after consuming the + # input. + # + # @note The input is the data given to #initialize; this method takes no arguments. + # @return [Array] + # @see #advance + # + def lex + tokens = [] + + advance do |token| + tokens << token + end + + reset + + return tokens + end + + ## + # Advances through the input and generates the corresponding tokens. Each + # token is yielded to the supplied block. + # + # Each token is an Array in the following format: + # + # [TYPE, VALUE, LINE] + # + # The type is a Symbol, the value is either nil or a String and the line is the current line number. 
+ # + # This method stores the supplied block in `@block` and resets it after + # the lexer loop has finished. + # + # This method does *not* reset the internal state of the lexer. + # + # + # @yieldparam [Array] token + # @return [NilClass] + # + def advance(&block) + @block = block + + advance_native + ensure + @block = nil + end + + ## + # @return [TrueClass|FalseClass] + # + def html? + return !!html + end + + private + + ## + # @param [Fixnum] amount The amount of lines to advance. + # + def advance_line(amount = 1) + @line += amount + end + + ## + # Builds a token from the given type and value plus the current line and passes it to the block stored in `@block`. + # + # @param [Symbol] type The token type. + # @param [String] value The token value. + # + def add_token(type, value = nil) + token = [type, value, @line] + + @block.call(token) + end + + ## + # Returns the name of the element we're currently in. + # + # @return [String] + # + def current_element + return @elements.last + end + + def on_string(value) + add_token(:T_STRING, value) + end + + def on_start_doctype + add_token(:T_DOCTYPE_START) + end + + def on_doctype_type(value) + add_token(:T_DOCTYPE_TYPE, value) + end + + def on_doctype_name(value) + add_token(:T_DOCTYPE_NAME, value) + end + + def on_doctype_end + add_token(:T_DOCTYPE_END) + end + + def on_cdata_start + add_token(:T_CDATA_START) + end + + def on_cdata_end + add_token(:T_CDATA_END) + end + + def on_comment_start + add_token(:T_COMMENT_START) + end + + def on_comment_end + add_token(:T_COMMENT_END) + end + + def on_xml_decl_start + add_token(:T_XML_DECL_START) + end + + def on_xml_decl_end + add_token(:T_XML_DECL_END) + end + + def on_element_start(name) + add_token(:T_ELEM_START) + + if name.include?(':') + ns, name = name.split(':') + + add_token(:T_ELEM_NS, ns) + end + + @elements << name if html? + + add_token(:T_ELEM_NAME, name) + end + + def on_element_open_end + if html? 
and HTML_VOID_ELEMENTS.include?(current_element) + add_token(:T_ELEM_END) + @elements.pop + end + end + + def on_element_end + add_token(:T_ELEM_END) + + @elements.pop if html? + end + + def on_text(value) + unless value.empty? + add_token(:T_TEXT, value) + + lines = value.count("\n") + + advance_line(lines) if lines > 0 + end + end + + def on_attribute(value) + add_token(:T_ATTR, value) + end + + def on_newline + @line += 1 + end + end # Lexer + end # XML +end # Oga diff --git a/lib/oga/xml/lexer.rl b/lib/oga/xml/lexer.rl deleted file mode 100644 index 1e8ea96..0000000 --- a/lib/oga/xml/lexer.rl +++ /dev/null @@ -1,501 +0,0 @@ -%%machine lexer; # % - -module Oga - module XML - ## - # Low level lexer that supports both XML and HTML (using an extra option). - # To lex HTML input set the `:html` option to `true` when creating an - # instance of the lexer: - # - # lexer = Oga::XML::Lexer.new(:html => true) - # - # @!attribute [r] html - # @return [TrueClass|FalseClass] - # - # @!attribute [r] tokens - # @return [Array] - # - class Lexer - %% write data; - - # % fix highlight - - attr_reader :html - - ## - # Names of the HTML void elements that should be handled when HTML lexing - # is enabled. - # - # @return [Set] - # - HTML_VOID_ELEMENTS = Set.new([ - 'area', - 'base', - 'br', - 'col', - 'command', - 'embed', - 'hr', - 'img', - 'input', - 'keygen', - 'link', - 'meta', - 'param', - 'source', - 'track', - 'wbr' - ]) - - ## - # @param [String] data The data to lex. - # - # @param [Hash] options - # - # @option options [Symbol] :html When set to `true` the lexer will treat - # the input as HTML instead of SGML/XML. This makes it possible to lex - # HTML void elements such as ``. - # - def initialize(data, options = {}) - options.each do |key, value| - instance_variable_set("@#{key}", value) if respond_to?(key) - end - - @data = data - - reset - end - - ## - # Resets the internal state of the lexer. 
Typically you don't need to - # call this method yourself as its called by #lex after lexing a given - # String. - # - def reset - @line = 1 - @elements = [] - - @buffer_start_position = nil - end - - ## - # Gathers all the tokens for the input and returns them as an Array. - # - # This method resets the internal state of the lexer after consuming the - # input. - # - # @param [String] data The string to consume. - # @return [Array] - # @see #advance - # - def lex - tokens = [] - - advance do |token| - tokens << token - end - - reset - - return tokens - end - - ## - # Advances through the input and generates the corresponding tokens. Each - # token is yielded to the supplied block. - # - # Each token is an Array in the following format: - # - # [TYPE, VALUE] - # - # The type is a symbol, the value is either nil or a String. - # - # This method stores the supplied block in `@block` and resets it after - # the lexer loop has finished. - # - # This method does *not* reset the internal state of the lexer. - # - # - # @param [String] data The String to consume. - # @return [Array] - # - def advance(&block) - @block = block - - data = @data - ts = nil - te = nil - stack = [] - top = 0 - cs = self.class.lexer_start - act = 0 - eof = @data.bytesize - p = 0 - pe = eof - - _lexer_eof_trans = self.class.send(:_lexer_eof_trans) - _lexer_from_state_actions = self.class.send(:_lexer_from_state_actions) - _lexer_index_offsets = self.class.send(:_lexer_index_offsets) - _lexer_indicies = self.class.send(:_lexer_indicies) - _lexer_key_spans = self.class.send(:_lexer_key_spans) - _lexer_to_state_actions = self.class.send(:_lexer_to_state_actions) - _lexer_trans_actions = self.class.send(:_lexer_trans_actions) - _lexer_trans_keys = self.class.send(:_lexer_trans_keys) - _lexer_trans_targs = self.class.send(:_lexer_trans_targs) - - %% write exec; - - # % fix highlight - ensure - @block = nil - end - - ## - # @return [TrueClass|FalseClass] - # - def html? 
- return !!html - end - - private - - ## - # @param [Fixnum] amount The amount of lines to advance. - # - def advance_line(amount = 1) - @line += amount - end - - ## - # Emits a token who's value is based on the supplied start/stop position. - # - # @param [Symbol] type The token type. - # @param [Fixnum] start - # @param [Fixnum] stop - # - # @see #text - # @see #add_token - # - def emit(type, start, stop) - value = text(start, stop) - - add_token(type, value) - end - - ## - # Returns the text of the current buffer based on the supplied start and - # stop position. - # - # @param [Fixnum] start - # @param [Fixnum] stop - # @return [String] - # - def text(start, stop) - return @data.byteslice(start, stop - start) - end - - ## - # Adds a token with the given type and value to the list. - # - # @param [Symbol] type The token type. - # @param [String] value The token value. - # - def add_token(type, value = nil) - token = [type, value, @line] - - @block.call(token) - end - - ## - # Enables buffering starting at the given position. - # - # @param [Fixnum] position The start position of the buffer. - # - def start_buffer(position) - @buffer_start_position = position - end - - ## - # Emits a text token. - # - # @param [Fixnum] start - # @param [Fixnum] stop - # - def emit_text(start, stop) - content = text(start, stop) - - unless content.empty? - add_token(:T_TEXT, content) - - lines = content.count("\n") - - advance_line(lines) if lines > 0 - end - end - - ## - # Returns the name of the element we're currently in. - # - # @return [String] - # - def current_element - return @elements.last - end - - %%{ - getkey (data.getbyte(p) || 0); - - newline = '\n' | '\r\n'; - whitespace = [ \t]; - identifier = [a-zA-Z0-9\-_:]+; - - # Strings - # - # Strings in HTML can either be single or double quoted. If a string - # starts with one of these quotes it must be closed with the same type - # of quote. 
- dquote = '"'; - squote = "'"; - - # Machine for processing double quoted strings. - string_dquote := |* - ^dquote+ => { - emit(:T_STRING, ts, te) - }; - - dquote => { fret; }; - *|; - - # Machine for processing single quoted strings. - string_squote := |* - ^squote+ => { - emit(:T_STRING, ts, te) - }; - - squote => { fret; }; - *|; - - # DOCTYPES - # - # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax - # - # These rules support the 3 flavours of doctypes: - # - # 1. Normal doctypes, as introduced in the HTML5 specification. - # 2. Deprecated doctypes, the more verbose ones used prior to HTML5. - # 3. Legacy doctypes - # - doctype_start = ' { emit(:T_DOCTYPE_TYPE, ts, te) }; - - # Lex the public/system IDs as regular strings. - dquote => { fcall string_dquote; }; - squote => { fcall string_squote; }; - - # Whitespace inside doctypes is ignored since there's no point in - # including it. - whitespace; - - identifier => { emit(:T_DOCTYPE_NAME, ts, te) }; - - '>' => { - add_token(:T_DOCTYPE_END) - fret; - }; - *|; - - # CDATA - # - # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections - # - # CDATA tags are broken up into 3 parts: the start, the content and the - # end tag. - # - # In HTML CDATA tags have no meaning/are not supported. Oga does - # support them but treats their contents as plain text. - # - cdata_start = ''; - - action start_cdata { - add_token(:T_CDATA_START) - - fcall cdata; - } - - # Machine that for processing the contents of CDATA tags. Everything - # inside a CDATA tag is treated as plain text. - cdata := |* - any* cdata_end => { - emit_text(ts, te - 3) - add_token(:T_CDATA_END) - - fret; - }; - *|; - - # Comments - # - # http://www.w3.org/TR/html-markup/syntax.html#comments - # - # Comments are lexed into 3 parts: the start tag, the content and the - # end tag. - # - # Unlike the W3 specification these rules *do* allow character - # sequences such as `--` and `->`. 
Putting extra checks in for these - # sequences would actually make the rules/actions more complex. - # - comment_start = ''; - - action start_comment { - add_token(:T_COMMENT_START) - - fcall comment; - } - - # Machine used for processing the contents of a comment. Everything - # inside a comment is treated as plain text (similar to CDATA tags). - comment := |* - any* comment_end => { - emit_text(ts, te - 3) - add_token(:T_COMMENT_END) - - fret; - }; - *|; - - # XML declaration tags - # - # http://www.w3.org/TR/REC-xml/#sec-prolog-dtd - # - xml_decl_start = ''; - - action start_xml_decl { - add_token(:T_XML_DECL_START) - - fcall xml_decl; - } - - # Machine that processes the contents of an XML declaration tag. - xml_decl := |* - xml_decl_end => { - add_token(:T_XML_DECL_END) - - fret; - }; - - # Attributes and their values (e.g. version="1.0"). - identifier => { emit(:T_ATTR, ts, te) }; - - dquote => { fcall string_dquote; }; - squote => { fcall string_squote; }; - - any; - *|; - - # Elements - # - # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements - # - - # Action that creates the tokens for the opening tag, name and - # namespace (if any). Remaining work is delegated to a dedicated - # machine. - action start_element { - add_token(:T_ELEM_START) - - # Add the element name. If the name includes a namespace we'll break - # the name up into two separate tokens. - name = text(ts + 1, te) - - if name.include?(':') - ns, name = name.split(':') - - add_token(:T_ELEM_NS, ns) - end - - @elements << name if html? - - add_token(:T_ELEM_NAME, name) - - fcall element_head; - } - - element_start = '<' identifier; - - # Machine used for processing the characters inside a element head. An - # element head is everything between ``. - # - # For example, in `

` the element head is ` foo="bar"`. - # - element_head := |* - whitespace | '='; - - newline => { advance_line }; - - # Attribute names. - identifier => { emit(:T_ATTR, ts, te) }; - - # Attribute values. - dquote => { fcall string_dquote; }; - squote => { fcall string_squote; }; - - # The closing character of the open tag. - ('>' | '/') => { - fhold; - fret; - }; - *|; - - main := |* - element_start => start_element; - doctype_start => start_doctype; - cdata_start => start_cdata; - comment_start => start_comment; - xml_decl_start => start_xml_decl; - - # Enter the body of the tag. If HTML mode is enabled and the current - # element is a void element we'll close it and bail out. - '>' => { - if html? and HTML_VOID_ELEMENTS.include?(current_element) - add_token(:T_ELEM_END, nil) - @elements.pop - end - }; - - # Regular closing tags. - '' => { - add_token(:T_ELEM_END, nil) - - @elements.pop if html? - }; - - # Self closing elements that are not handled by the HTML mode. - '/>' => { - add_token(:T_ELEM_END, nil) - - @elements.pop if html? - }; - - # Note that this rule should be declared at the very bottom as it - # will otherwise take precedence over the other rules. 
- ^('<' | '>')+ => { - emit_text(ts, te) - }; - *|; - }%% - end # Lexer - end # XML -end # Oga diff --git a/oga.gemspec b/oga.gemspec index 76b027c..40407f4 100644 --- a/oga.gemspec +++ b/oga.gemspec @@ -12,6 +12,8 @@ Gem::Specification.new do |s| s.files = File.read(File.expand_path('../MANIFEST', __FILE__)).split("\n") + s.extensions = ['ext/liboga/extconf.rb'] + s.has_rdoc = 'yard' s.required_ruby_version = '>= 1.9.3' @@ -24,4 +26,5 @@ Gem::Specification.new do |s| s.add_development_dependency 'simplecov' s.add_development_dependency 'kramdown' s.add_development_dependency 'benchmark-ips' + s.add_development_dependency 'rake-compiler' end diff --git a/task/lexer.rake b/task/lexer.rake index e1b58f8..8823bea 100644 --- a/task/lexer.rake +++ b/task/lexer.rake @@ -18,5 +18,11 @@ rule '.rb' => '.rl' do |task| end end -desc 'Generates the lexer' -task :lexer => [LEXER_OUTPUT] +rule '.c' => '.rl' do |task| + Cliver.assert('ragel', '~> 6.7') + + sh "ragel -C -G2 #{task.source} -o #{task.name}" +end + +desc 'Generates the lexers' +task :lexer => ['ext/liboga/lexer.c'] diff --git a/task/test.rake b/task/test.rake index 1f6c0f2..d19ce6c 100644 --- a/task/test.rake +++ b/task/test.rake @@ -1,4 +1,4 @@ desc 'Runs the tests' -task :test => [:generate] do +task :test => [:generate, :compile] do sh 'rspec spec' end