Initial setup using a C extension.

While I've tried to keep Oga pure Ruby for as long as possible, the performance
of Ragel's Ruby output was not worth the trouble. For example, lexing 10MB of
XML would take 5 to 6 seconds at least. Nokogiri, on the other hand, can parse
that same XML into a DOM document in about 300 milliseconds. Such a big
performance difference is not acceptable.

To work around this the XML/HTML lexer will be implemented in C for
MRI/Rubinius and Java for JRuby. For now there's only a C extension as I
haven't read up yet on the JRuby API. The end goal is to provide some sort of
Ragel "template" that can be used to generate the corresponding C/Java
extension code. This would remove the need of duplicating the grammar and
associated code.

The native extension setup is a hybrid between native and Ruby. The raw Ragel
stuff happens in C/Java while the actual logic of actions happens in Ruby. This
adds a small amount of overhead but makes it much easier to maintain the lexer.
Even with this extra overhead the performance is much better than pure Ruby.
The 10MB of XML mentioned above is lexed in about 600 milliseconds. In other
words, it's roughly 10 times faster.
This commit is contained in:
Yorick Peterse 2014-05-05 00:25:34 +02:00
parent baaa24a760
commit 2689d3f65a
17 changed files with 656 additions and 512 deletions

View File

@ -7,3 +7,6 @@ trim_trailing_whitespace = true
[*.{y,rb,rl}] [*.{y,rb,rl}]
indent_size = 2 indent_size = 2
[*.{h,h},ext/oga/xml/*.rl]
indent_size = 2

6
.gitignore vendored
View File

@ -3,10 +3,14 @@ coverage
pkg pkg
Gemfile.lock Gemfile.lock
lib/oga/xml/lexer.rb
lib/oga/xml/parser.rb lib/oga/xml/parser.rb
lib/liboga.*
benchmark/fixtures/big.xml benchmark/fixtures/big.xml
profile/samples/*.txt profile/samples/*.txt
profile/samples/*/*.txt profile/samples/*/*.txt
*.so
tmp
ext/liboga/lexer.c

View File

@ -5,6 +5,14 @@ README.md
doc/DCO.md doc/DCO.md
doc/changelog.md doc/changelog.md
doc/css/common.css doc/css/common.css
ext/liboga/extconf.rb
ext/liboga/lexer.c
ext/liboga/lexer.h
ext/liboga/lexer.rl
ext/liboga/liboga.c
ext/liboga/liboga.h
ext/liboga/xml.c
ext/liboga/xml.h
lib/oga.rb lib/oga.rb
lib/oga/html/parser.rb lib/oga/html/parser.rb
lib/oga/version.rb lib/oga/version.rb
@ -14,10 +22,8 @@ lib/oga/xml/doctype.rb
lib/oga/xml/document.rb lib/oga/xml/document.rb
lib/oga/xml/element.rb lib/oga/xml/element.rb
lib/oga/xml/lexer.rb lib/oga/xml/lexer.rb
lib/oga/xml/lexer.rl
lib/oga/xml/node.rb lib/oga/xml/node.rb
lib/oga/xml/parser.rb lib/oga/xml/parser.rb
lib/oga/xml/parser.y
lib/oga/xml/pull_parser.rb lib/oga/xml/pull_parser.rb
lib/oga/xml/text.rb lib/oga/xml/text.rb
lib/oga/xml/xml_declaration.rb lib/oga/xml/xml_declaration.rb

View File

@ -1,33 +1,38 @@
require 'bundler/gem_tasks' require 'bundler/gem_tasks'
require 'digest/sha2' require 'digest/sha2'
require 'rake/clean' require 'rake/clean'
require 'rake/extensiontask'
require 'cliver' require 'cliver'
GEMSPEC = Gem::Specification.load('oga.gemspec') GEMSPEC = Gem::Specification.load('oga.gemspec')
LEXER_OUTPUT = 'lib/oga/xml/lexer.rb'
PARSER_OUTPUT = 'lib/oga/xml/parser.rb' PARSER_OUTPUT = 'lib/oga/xml/parser.rb'
CLEAN.include( CLEAN.include(
'coverage', 'coverage',
'yardoc', 'yardoc',
LEXER_OUTPUT,
PARSER_OUTPUT, PARSER_OUTPUT,
'benchmark/fixtures/big.xml', 'benchmark/fixtures/big.xml',
'profile/samples/**/*.txt' 'profile/samples/**/*.txt',
'lib/liboga.*',
'tmp',
'ext/liboga/lexer.c'
) )
FILE_LIST = FileList.new( FILE_LIST = FileList.new(
'checkum/**/*.*', 'checkum/**/*.*',
'doc/**/*.*', 'doc/**/*.*',
'lib/**/*.*', 'lib/**/*.rb',
'LICENSE', 'LICENSE',
'MANIFEST', 'MANIFEST',
'*.gemspec', '*.gemspec',
'README.md', 'README.md',
'.yardopts' '.yardopts',
'ext/**/*.*'
) )
Rake::ExtensionTask.new('liboga', GEMSPEC)
Dir['./task/*.rake'].each do |task| Dir['./task/*.rake'].each do |task|
import(task) import(task)
end end

13
ext/liboga/extconf.rb Normal file
View File

@ -0,0 +1,13 @@
# Build configuration for the liboga native extension, executed through mkmf.
require 'mkmf'

have_header('ruby.h')

# Setting the DEBUG environment variable turns optimisation off; without it
# the extension is built with -O3 and debugging symbols.
optimisation_flags = ENV['DEBUG'] ? ' -O0' : ' -O3 -g'

# Strict compiler warnings are enabled for every build.
$CFLAGS << ' -Wextra -Wall -pedantic'
$CFLAGS << optimisation_flags

create_makefile('liboga/liboga')

10
ext/liboga/lexer.h Normal file
View File

@ -0,0 +1,10 @@
/*
 * Declarations for the XML lexer part of the liboga extension.
 *
 * liboga.h is included outside of the include guard below; liboga.h carries
 * its own guard, so including it repeatedly is harmless.
 */
#include "liboga.h"
#ifndef LIBOGA_XML_LEXER_H
#define LIBOGA_XML_LEXER_H
/* Ruby class object of the lexer; the variable itself lives in lexer.rl. */
extern VALUE oga_cLexer;
/* Defines the lexer class and attaches its native method (see lexer.rl). */
extern void Init_liboga_xml_lexer();
#endif

298
ext/liboga/lexer.rl Normal file
View File

@ -0,0 +1,298 @@
#include "lexer.h"
VALUE oga_cLexer;
%%machine lexer;
/**
 * Calls the Ruby method `name` on `self`, passing the bytes between `ts` and
 * `te` as a single String argument.
 *
 * @param self     The object that receives the method call.
 * @param name     Name of the Ruby method to call.
 * @param encoding Encoding assigned to the String handed to Ruby.
 * @param ts       Pointer to the first byte of the value.
 * @param te       Pointer to the byte right after the last byte of the value.
 */
void oga_xml_lexer_callback(
    VALUE self,
    const char *name,
    rb_encoding *encoding,
    const char *ts,
    const char *te
)
{
    long length = te - ts;

    /*
     * Build the String straight from the input buffer. rb_enc_str_new()
     * copies exactly `length` bytes, so no temporary NUL terminated copy is
     * needed (such a copy would have to be freed again after every token)
     * and values containing NUL bytes are passed on in full.
     */
    VALUE value = rb_enc_str_new(ts, length, encoding);

    /* rb_intern() returns a method ID, not an object reference. */
    ID method = rb_intern(name);

    rb_funcall(self, method, 1, value);
}
/**
 * Calls the Ruby method `name` on `self` without passing any arguments.
 *
 * @param self The object that receives the method call.
 * @param name Name of the Ruby method to call.
 */
void oga_xml_lexer_callback_simple(VALUE self, const char *name)
{
    /* rb_intern() returns a method ID, not an object reference. */
    ID method = rb_intern(name);

    rb_funcall(self, method, 0);
}
%% write data;
/**
 * Runs the Ragel state machine over the String stored in the `@data` instance
 * variable of `self`. The machine's actions report what they find by calling
 * methods on `self` (see oga_xml_lexer_callback and
 * oga_xml_lexer_callback_simple).
 *
 * @param self The lexer instance.
 * @return Always nil.
 */
VALUE oga_xml_lexer_advance(VALUE self)
{
    /* Pull the data in from Ruby land. */
    VALUE data_ivar = rb_ivar_get(self, rb_intern("@data"));

    /*
     * Coerce the value to a String first (this raises for values that can
     * not be converted) so that the encoding below is read from the String
     * that is actually lexed.
     */
    char *data_str_val = StringValuePtr(data_ivar);

    /* Make sure that all data passed back to Ruby has the proper encoding. */
    rb_encoding *encoding = rb_enc_get(data_ivar);

    const char *p = data_str_val;

    /*
     * Use the byte length stored in the String instead of strlen(): Ruby
     * Strings may contain NUL bytes, which would otherwise end the input
     * early, and the stored length does not require scanning the buffer.
     */
    const char *pe  = data_str_val + RSTRING_LEN(data_ivar);
    const char *eof = pe;
    const char *ts, *te;

    /* State variables used by the code Ragel generates below. */
    int act = 0;
    int cs  = 0;
    int top = 0;

    /* Return stack used by the fcall/fret statements of the grammar. */
    int stack[8];

    %% write init;
    %% write exec;

    return Qnil;
}
%%{
newline = '\n' | '\r\n';
whitespace = [ \t];
identifier = [a-zA-Z0-9\-_:]+;
# Strings
#
# Strings in HTML can either be single or double quoted. If a string
# starts with one of these quotes it must be closed with the same type
# of quote.
#
# Both string machines are entered with `fcall` by the machine that saw the
# opening quote and hand control back to that machine with `fret` once the
# matching closing quote is found.
dquote = '"';
squote = "'";
# Machine for processing double quoted strings. Everything up to the next
# double quote is reported through a single `on_string` callback. An empty
# string (`""`) produces no callback at all since the first rule requires at
# least one character.
string_dquote := |*
^dquote+ => {
oga_xml_lexer_callback(self, "on_string", encoding, ts, te);
};
dquote => { fret; };
*|;
# Machine for processing single quoted strings. Works the same way as
# `string_dquote` but uses the single quote as the delimiter.
string_squote := |*
^squote+ => {
oga_xml_lexer_callback(self, "on_string", encoding, ts, te);
};
squote => { fret; };
*|;
# DOCTYPES
#
# http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
#
# These rules support the 3 flavours of doctypes:
#
# 1. Normal doctypes, as introduced in the HTML5 specification.
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
# 3. Legacy doctypes
#
# The start tag is matched case insensitively (note the `i` suffix) and has to
# be followed by at least one whitespace character.
doctype_start = '<!DOCTYPE'i whitespace+;
# Reports the start of a doctype and hands control to the `doctype` machine.
action start_doctype {
oga_xml_lexer_callback_simple(self, "on_start_doctype");
fcall doctype;
}
# Machine for processing doctypes. Doctype values such as the public
# and system IDs are treated as T_STRING tokens.
doctype := |*
# The keywords are matched case sensitively. This rule is listed before the
# `identifier` rule so that it wins when both match the same input.
'PUBLIC' | 'SYSTEM' => {
oga_xml_lexer_callback(self, "on_doctype_type", encoding, ts, te);
};
# Lex the public/system IDs as regular strings.
dquote => { fcall string_dquote; };
squote => { fcall string_squote; };
# Whitespace inside doctypes is ignored since there's no point in
# including it.
whitespace;
# Any other identifier is reported as the name of the doctype.
identifier => {
oga_xml_lexer_callback(self, "on_doctype_name", encoding, ts, te);
};
# End of the doctype, return to the machine that called us.
'>' => {
oga_xml_lexer_callback_simple(self, "on_doctype_end");
fret;
};
*|;
# CDATA
#
# http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
#
# CDATA tags are broken up into 3 parts: the start, the content and the
# end tag.
#
# In HTML CDATA tags have no meaning/are not supported. Oga does
# support them but treats their contents as plain text.
#
cdata_start = '<![CDATA[';
cdata_end   = ']]>';

# Reports the start of a CDATA tag and hands control to the `cdata` machine.
action start_cdata {
  oga_xml_lexer_callback_simple(self, "on_cdata_start");
  fcall cdata;
}

# Machine for processing the contents of CDATA tags. Everything inside a
# CDATA tag is treated as plain text.
cdata := |*
  # The finish-guarded concatenation (`:>>`) stops `any*` as soon as the
  # first `]]>` has been matched. With a plain concatenation the scanner
  # keeps looking for a longer match and would run up to the *last* `]]>`
  # of the input, swallowing everything between two CDATA tags.
  any* :>> cdata_end => {
    # `te - 3` excludes the 3 bytes of the end tag from the text.
    oga_xml_lexer_callback(self, "on_text", encoding, ts, te - 3);
    oga_xml_lexer_callback_simple(self, "on_cdata_end");
    fret;
  };
*|;
# Comments
#
# http://www.w3.org/TR/html-markup/syntax.html#comments
#
# Comments are lexed into 3 parts: the start tag, the content and the
# end tag.
#
# Unlike the W3 specification these rules *do* allow character
# sequences such as `--` and `->`. Putting extra checks in for these
# sequences would actually make the rules/actions more complex.
#
comment_start = '<!--';
comment_end   = '-->';

# Reports the start of a comment and hands control to the `comment` machine.
action start_comment {
  oga_xml_lexer_callback_simple(self, "on_comment_start");
  fcall comment;
}

# Machine used for processing the contents of a comment. Everything
# inside a comment is treated as plain text (similar to CDATA tags).
comment := |*
  # The finish-guarded concatenation (`:>>`) stops `any*` as soon as the
  # first `-->` has been matched. With a plain concatenation the scanner
  # keeps looking for a longer match and would run up to the *last* `-->`
  # of the input, swallowing everything between two comments.
  any* :>> comment_end => {
    # `te - 3` excludes the 3 bytes of the end tag from the text.
    oga_xml_lexer_callback(self, "on_text", encoding, ts, te - 3);
    oga_xml_lexer_callback_simple(self, "on_comment_end");
    fret;
  };
*|;
# XML declaration tags
#
# http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
#
xml_decl_start = '<?xml';
xml_decl_end = '?>';
# Reports the start of an XML declaration and hands control to the `xml_decl`
# machine.
action start_xml_decl {
oga_xml_lexer_callback_simple(self, "on_xml_decl_start");
fcall xml_decl;
}
# Machine that processes the contents of an XML declaration tag.
xml_decl := |*
# End of the declaration, return to the machine that called us.
xml_decl_end => {
oga_xml_lexer_callback_simple(self, "on_xml_decl_end");
fret;
};
# Attributes and their values (e.g. version="1.0").
identifier => {
oga_xml_lexer_callback(self, "on_attribute", encoding, ts, te);
};
dquote => { fcall string_dquote; };
squote => { fcall string_squote; };
# Any other character (such as whitespace or `=`) is consumed without
# producing a callback.
any;
*|;
# Elements
#
# http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
#
# Action that reports the start of an element. The element name, including
# the namespace prefix if there is one, is passed to `on_element_start`
# (`ts + 1` skips the leading `<`). Remaining work is delegated to a
# dedicated machine.
action start_element {
oga_xml_lexer_callback(self, "on_element_start", encoding, ts + 1, te);
fcall element_head;
}
element_start = '<' identifier;
# Machine used for processing the characters inside a element head. An
# element head is everything between `<NAME` (where NAME is the element
# name) and `>`.
#
# For example, in `<p foo="bar">` the element head is ` foo="bar"`.
#
element_head := |*
# Whitespace and the `=` between an attribute and its value are skipped.
whitespace | '=';
# Newlines are reported so the line number can be kept up to date.
newline => {
oga_xml_lexer_callback_simple(self, "on_newline");
};
# Attribute names.
identifier => {
oga_xml_lexer_callback(self, "on_attribute", encoding, ts, te);
};
# Attribute values.
dquote => { fcall string_dquote; };
squote => { fcall string_squote; };
# The closing character of the open tag. The character is put back using
# `fhold` so that the calling machine gets to process the `>` or `/>`.
('>' | '/') => {
fhold;
fret;
};
*|;
# The entry point of the lexer. The start tags below hand control over to
# their dedicated machines, the remaining rules are handled in place.
main := |*
element_start => start_element;
doctype_start => start_doctype;
cdata_start => start_cdata;
comment_start => start_comment;
xml_decl_start => start_xml_decl;
# Enter the body of the tag. Whether anything happens at this point (such as
# closing an HTML void element) is decided by the `on_element_open_end`
# callback.
'>' => {
oga_xml_lexer_callback_simple(self, "on_element_open_end");
};
# Regular closing tags. The name of the closing tag is not passed on to the
# callback.
'</' identifier '>' => {
oga_xml_lexer_callback_simple(self, "on_element_end");
};
# Self closing elements that are not handled by the HTML mode.
'/>' => {
oga_xml_lexer_callback_simple(self, "on_element_end");
};
# Plain text: one or more characters other than `<` and `>`.
#
# Note that this rule should be declared at the very bottom as it
# will otherwise take precedence over the other rules.
^('<' | '>')+ => {
oga_xml_lexer_callback(self, "on_text", encoding, ts, te);
};
*|;
}%%
void Init_liboga_xml_lexer()
{
oga_cLexer = rb_define_class_under(oga_mXML, "Lexer", rb_cObject);
rb_define_method(oga_cLexer, "advance_native", oga_xml_lexer_advance, 0);
}

11
ext/liboga/liboga.c Normal file
View File

@ -0,0 +1,11 @@
#include "liboga.h"
VALUE oga_mOga;
/**
 * Entry point of the extension: defines the top-level Oga module and then
 * runs the initialisers of the individual components.
 */
void Init_liboga()
{
/* Has to happen first, Init_liboga_xml() defines its module under oga_mOga. */
oga_mOga = rb_define_module("Oga");
Init_liboga_xml();
/* Runs after Init_liboga_xml() since the lexer class is defined under oga_mXML. */
Init_liboga_xml_lexer();
}

17
ext/liboga/liboga.h Normal file
View File

@ -0,0 +1,17 @@
/*
 * Main header of the liboga extension: pulls in the Ruby C API, the C library
 * headers and the headers of the individual components.
 */
#ifndef LIBOGA_H
#define LIBOGA_H
#include <ruby.h>
#include <ruby/encoding.h>
#include <string.h>
/*
 * NOTE(review): <malloc.h> is not a standard C header and is not available
 * on every platform. Confirm whether it is needed at all or whether
 * <stdlib.h> would do.
 */
#include <malloc.h>
#include <stdio.h>
/* The top-level Oga module; the variable itself lives in liboga.c. */
extern VALUE oga_mOga;
/*
 * The component headers include this header themselves. The include guard
 * above keeps that from recursing.
 */
#include "xml.h"
#include "lexer.h"
/* Entry point of the extension. */
void Init_liboga();
#endif

8
ext/liboga/xml.c Normal file
View File

@ -0,0 +1,8 @@
#include "xml.h"
VALUE oga_mXML;
/**
 * Defines the XML module under the module stored in oga_mOga and stores it
 * in oga_mXML. oga_mOga therefore has to be set before this function runs.
 */
void Init_liboga_xml()
{
oga_mXML = rb_define_module_under(oga_mOga, "XML");
}

10
ext/liboga/xml.h Normal file
View File

@ -0,0 +1,10 @@
/*
 * Declarations for the XML module of the liboga extension.
 */
#ifndef LIBOGA_XML_H
#define LIBOGA_XML_H
#include "liboga.h"
/* The XML module; the variable itself lives in xml.c. */
extern VALUE oga_mXML;
/* Defines the XML module (see xml.c). */
void Init_liboga_xml();
#endif

View File

@ -1,5 +1,7 @@
require 'set' require 'set'
require_relative 'liboga'
require_relative 'oga/xml/lexer' require_relative 'oga/xml/lexer'
require_relative 'oga/xml/parser' require_relative 'oga/xml/parser'
require_relative 'oga/xml/pull_parser' require_relative 'oga/xml/pull_parser'

249
lib/oga/xml/lexer.rb Normal file
View File

@ -0,0 +1,249 @@
module Oga
  module XML
    ##
    # Low level lexer that supports both XML and HTML (using an extra option).
    # To lex HTML input set the `:html` option to `true` when creating an
    # instance of the lexer:
    #
    #     lexer = Oga::XML::Lexer.new(data, :html => true)
    #
    # The input itself is scanned by `advance_native`, which is not defined in
    # this file. It reports what it finds by calling the private `on_*`
    # methods of this class, which in turn create the tokens.
    #
    # @!attribute [r] html
    #  @return [TrueClass|FalseClass]
    #
    class Lexer
      attr_reader :html

      ##
      # Names of the HTML void elements that should be handled when HTML lexing
      # is enabled.
      #
      # @return [Set]
      #
      HTML_VOID_ELEMENTS = Set.new([
        'area',
        'base',
        'br',
        'col',
        'command',
        'embed',
        'hr',
        'img',
        'input',
        'keygen',
        'link',
        'meta',
        'param',
        'source',
        'track',
        'wbr'
      ]).freeze

      ##
      # @param [String] data The data to lex.
      #
      # @param [Hash] options Every option for which the lexer has a public
      #  method of the same name is stored in the instance variable of that
      #  name, other options are ignored.
      #
      # @option options [TrueClass|FalseClass] :html When set to `true` the
      #  lexer will treat the input as HTML instead of SGML/XML. This makes it
      #  possible to lex HTML void elements such as `<link href="">`.
      #
      def initialize(data, options = {})
        options.each do |key, value|
          instance_variable_set("@#{key}", value) if respond_to?(key)
        end

        @data = data

        reset
      end

      ##
      # Resets the internal state of the lexer. Typically you don't need to
      # call this method yourself as it's called by #lex after lexing the
      # input.
      #
      def reset
        @line     = 1
        @elements = []
      end

      ##
      # Gathers all the tokens for the input and returns them as an Array.
      #
      # This method resets the internal state of the lexer after consuming the
      # input.
      #
      # @return [Array]
      # @see #advance
      #
      def lex
        tokens = []

        advance do |token|
          tokens << token
        end

        reset

        return tokens
      end

      ##
      # Advances through the input and generates the corresponding tokens. Each
      # token is yielded to the supplied block.
      #
      # Each token is an Array in the following format:
      #
      #     [TYPE, VALUE, LINE]
      #
      # The type is a symbol, the value is either nil or a String and the line
      # is the line number that was current when the token was created.
      #
      # This method stores the supplied block in `@block` and resets it after
      # the lexer loop has finished.
      #
      # This method does *not* reset the internal state of the lexer.
      #
      # @yieldparam [Array] token
      #
      def advance(&block)
        @block = block

        advance_native
      ensure
        @block = nil
      end

      ##
      # @return [TrueClass|FalseClass]
      #
      def html?
        return !!html
      end

      private

      ##
      # @param [Fixnum] amount The amount of lines to advance.
      #
      def advance_line(amount = 1)
        @line += amount
      end

      ##
      # Creates a token for the given type and value and passes it to the
      # block given to #advance.
      #
      # @param [Symbol] type The token type.
      # @param [String] value The token value.
      #
      def add_token(type, value = nil)
        token = [type, value, @line]

        @block.call(token)
      end

      ##
      # Returns the name of the element we're currently in.
      #
      # @return [String]
      #
      def current_element
        return @elements.last
      end

      ##
      # @param [String] value The contents of a quoted string.
      #
      def on_string(value)
        add_token(:T_STRING, value)
      end

      def on_start_doctype
        add_token(:T_DOCTYPE_START)
      end

      ##
      # @param [String] value The type of the doctype.
      #
      def on_doctype_type(value)
        add_token(:T_DOCTYPE_TYPE, value)
      end

      ##
      # @param [String] value The name of the doctype.
      #
      def on_doctype_name(value)
        add_token(:T_DOCTYPE_NAME, value)
      end

      def on_doctype_end
        add_token(:T_DOCTYPE_END)
      end

      def on_cdata_start
        add_token(:T_CDATA_START)
      end

      def on_cdata_end
        add_token(:T_CDATA_END)
      end

      def on_comment_start
        add_token(:T_COMMENT_START)
      end

      def on_comment_end
        add_token(:T_COMMENT_END)
      end

      def on_xml_decl_start
        add_token(:T_XML_DECL_START)
      end

      def on_xml_decl_end
        add_token(:T_XML_DECL_END)
      end

      ##
      # Adds the tokens for the start of an element. If the name includes a
      # namespace the name is broken up into two separate tokens.
      #
      # @param [String] name The element name, optionally prefixed with a
      #  namespace (`namespace:name`).
      #
      def on_element_start(name)
        add_token(:T_ELEM_START)

        if name.include?(':')
          # Only split on the first colon. Without the limit everything after
          # a second colon is silently dropped and a name such as "ns:" would
          # result in `nil` being used as the element name.
          ns, name = name.split(':', 2)

          add_token(:T_ELEM_NS, ns)
        end

        # The element stack is only needed to detect HTML void elements.
        @elements << name if html?

        add_token(:T_ELEM_NAME, name)
      end

      ##
      # Called for the `>` of an opening tag. HTML void elements don't have a
      # closing tag so they are closed right away.
      #
      def on_element_open_end
        if html? && HTML_VOID_ELEMENTS.include?(current_element)
          add_token(:T_ELEM_END)
          @elements.pop
        end
      end

      def on_element_end
        add_token(:T_ELEM_END)

        @elements.pop if html?
      end

      ##
      # Adds a text token unless the text is empty. The line number is advanced
      # after adding the token so the token carries the line the text starts
      # on.
      #
      # @param [String] value
      #
      def on_text(value)
        unless value.empty?
          add_token(:T_TEXT, value)

          lines = value.count("\n")

          advance_line(lines) if lines > 0
        end
      end

      ##
      # @param [String] value The name of the attribute.
      #
      def on_attribute(value)
        add_token(:T_ATTR, value)
      end

      def on_newline
        advance_line
      end
    end # Lexer
  end # XML
end # Oga

View File

@ -1,501 +0,0 @@
%%machine lexer; # %
module Oga
module XML
##
# Low level lexer that supports both XML and HTML (using an extra option).
# To lex HTML input set the `:html` option to `true` when creating an
# instance of the lexer:
#
# lexer = Oga::XML::Lexer.new(:html => true)
#
# @!attribute [r] html
# @return [TrueClass|FalseClass]
#
# @!attribute [r] tokens
# @return [Array]
#
class Lexer
%% write data;
# % fix highlight
attr_reader :html
##
# Names of the HTML void elements that should be handled when HTML lexing
# is enabled.
#
# @return [Set]
#
HTML_VOID_ELEMENTS = Set.new([
'area',
'base',
'br',
'col',
'command',
'embed',
'hr',
'img',
'input',
'keygen',
'link',
'meta',
'param',
'source',
'track',
'wbr'
])
##
# @param [String] data The data to lex.
#
# @param [Hash] options
#
# @option options [Symbol] :html When set to `true` the lexer will treat
# the input as HTML instead of SGML/XML. This makes it possible to lex
# HTML void elements such as `<link href="">`.
#
def initialize(data, options = {})
options.each do |key, value|
instance_variable_set("@#{key}", value) if respond_to?(key)
end
@data = data
reset
end
##
# Resets the internal state of the lexer. Typically you don't need to
# call this method yourself as its called by #lex after lexing a given
# String.
#
def reset
@line = 1
@elements = []
@buffer_start_position = nil
end
##
# Gathers all the tokens for the input and returns them as an Array.
#
# This method resets the internal state of the lexer after consuming the
# input.
#
# @param [String] data The string to consume.
# @return [Array]
# @see #advance
#
def lex
tokens = []
advance do |token|
tokens << token
end
reset
return tokens
end
##
# Advances through the input and generates the corresponding tokens. Each
# token is yielded to the supplied block.
#
# Each token is an Array in the following format:
#
# [TYPE, VALUE]
#
# The type is a symbol, the value is either nil or a String.
#
# This method stores the supplied block in `@block` and resets it after
# the lexer loop has finished.
#
# This method does *not* reset the internal state of the lexer.
#
#
# @param [String] data The String to consume.
# @return [Array]
#
def advance(&block)
@block = block
data = @data
ts = nil
te = nil
stack = []
top = 0
cs = self.class.lexer_start
act = 0
eof = @data.bytesize
p = 0
pe = eof
_lexer_eof_trans = self.class.send(:_lexer_eof_trans)
_lexer_from_state_actions = self.class.send(:_lexer_from_state_actions)
_lexer_index_offsets = self.class.send(:_lexer_index_offsets)
_lexer_indicies = self.class.send(:_lexer_indicies)
_lexer_key_spans = self.class.send(:_lexer_key_spans)
_lexer_to_state_actions = self.class.send(:_lexer_to_state_actions)
_lexer_trans_actions = self.class.send(:_lexer_trans_actions)
_lexer_trans_keys = self.class.send(:_lexer_trans_keys)
_lexer_trans_targs = self.class.send(:_lexer_trans_targs)
%% write exec;
# % fix highlight
ensure
@block = nil
end
##
# @return [TrueClass|FalseClass]
#
def html?
return !!html
end
private
##
# @param [Fixnum] amount The amount of lines to advance.
#
def advance_line(amount = 1)
@line += amount
end
##
# Emits a token who's value is based on the supplied start/stop position.
#
# @param [Symbol] type The token type.
# @param [Fixnum] start
# @param [Fixnum] stop
#
# @see #text
# @see #add_token
#
def emit(type, start, stop)
value = text(start, stop)
add_token(type, value)
end
##
# Returns the text of the current buffer based on the supplied start and
# stop position.
#
# @param [Fixnum] start
# @param [Fixnum] stop
# @return [String]
#
def text(start, stop)
return @data.byteslice(start, stop - start)
end
##
# Adds a token with the given type and value to the list.
#
# @param [Symbol] type The token type.
# @param [String] value The token value.
#
def add_token(type, value = nil)
token = [type, value, @line]
@block.call(token)
end
##
# Enables buffering starting at the given position.
#
# @param [Fixnum] position The start position of the buffer.
#
def start_buffer(position)
@buffer_start_position = position
end
##
# Emits a text token.
#
# @param [Fixnum] start
# @param [Fixnum] stop
#
def emit_text(start, stop)
content = text(start, stop)
unless content.empty?
add_token(:T_TEXT, content)
lines = content.count("\n")
advance_line(lines) if lines > 0
end
end
##
# Returns the name of the element we're currently in.
#
# @return [String]
#
def current_element
return @elements.last
end
%%{
getkey (data.getbyte(p) || 0);
newline = '\n' | '\r\n';
whitespace = [ \t];
identifier = [a-zA-Z0-9\-_:]+;
# Strings
#
# Strings in HTML can either be single or double quoted. If a string
# starts with one of these quotes it must be closed with the same type
# of quote.
dquote = '"';
squote = "'";
# Machine for processing double quoted strings.
string_dquote := |*
^dquote+ => {
emit(:T_STRING, ts, te)
};
dquote => { fret; };
*|;
# Machine for processing single quoted strings.
string_squote := |*
^squote+ => {
emit(:T_STRING, ts, te)
};
squote => { fret; };
*|;
# DOCTYPES
#
# http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
#
# These rules support the 3 flavours of doctypes:
#
# 1. Normal doctypes, as introduced in the HTML5 specification.
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
# 3. Legacy doctypes
#
doctype_start = '<!DOCTYPE'i whitespace+;
action start_doctype {
add_token(:T_DOCTYPE_START)
fcall doctype;
}
# Machine for processing doctypes. Doctype values such as the public
# and system IDs are treated as T_STRING tokens.
doctype := |*
'PUBLIC' | 'SYSTEM' => { emit(:T_DOCTYPE_TYPE, ts, te) };
# Lex the public/system IDs as regular strings.
dquote => { fcall string_dquote; };
squote => { fcall string_squote; };
# Whitespace inside doctypes is ignored since there's no point in
# including it.
whitespace;
identifier => { emit(:T_DOCTYPE_NAME, ts, te) };
'>' => {
add_token(:T_DOCTYPE_END)
fret;
};
*|;
# CDATA
#
# http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
#
# CDATA tags are broken up into 3 parts: the start, the content and the
# end tag.
#
# In HTML CDATA tags have no meaning/are not supported. Oga does
# support them but treats their contents as plain text.
#
cdata_start = '<![CDATA[';
cdata_end = ']]>';
action start_cdata {
add_token(:T_CDATA_START)
fcall cdata;
}
# Machine that for processing the contents of CDATA tags. Everything
# inside a CDATA tag is treated as plain text.
cdata := |*
any* cdata_end => {
emit_text(ts, te - 3)
add_token(:T_CDATA_END)
fret;
};
*|;
# Comments
#
# http://www.w3.org/TR/html-markup/syntax.html#comments
#
# Comments are lexed into 3 parts: the start tag, the content and the
# end tag.
#
# Unlike the W3 specification these rules *do* allow character
# sequences such as `--` and `->`. Putting extra checks in for these
# sequences would actually make the rules/actions more complex.
#
comment_start = '<!--';
comment_end = '-->';
action start_comment {
add_token(:T_COMMENT_START)
fcall comment;
}
# Machine used for processing the contents of a comment. Everything
# inside a comment is treated as plain text (similar to CDATA tags).
comment := |*
any* comment_end => {
emit_text(ts, te - 3)
add_token(:T_COMMENT_END)
fret;
};
*|;
# XML declaration tags
#
# http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
#
xml_decl_start = '<?xml';
xml_decl_end = '?>';
action start_xml_decl {
add_token(:T_XML_DECL_START)
fcall xml_decl;
}
# Machine that processes the contents of an XML declaration tag.
xml_decl := |*
xml_decl_end => {
add_token(:T_XML_DECL_END)
fret;
};
# Attributes and their values (e.g. version="1.0").
identifier => { emit(:T_ATTR, ts, te) };
dquote => { fcall string_dquote; };
squote => { fcall string_squote; };
any;
*|;
# Elements
#
# http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
#
# Action that creates the tokens for the opening tag, name and
# namespace (if any). Remaining work is delegated to a dedicated
# machine.
action start_element {
add_token(:T_ELEM_START)
# Add the element name. If the name includes a namespace we'll break
# the name up into two separate tokens.
name = text(ts + 1, te)
if name.include?(':')
ns, name = name.split(':')
add_token(:T_ELEM_NS, ns)
end
@elements << name if html?
add_token(:T_ELEM_NAME, name)
fcall element_head;
}
element_start = '<' identifier;
# Machine used for processing the characters inside a element head. An
# element head is everything between `<NAME` (where NAME is the element
# name) and `>`.
#
# For example, in `<p foo="bar">` the element head is ` foo="bar"`.
#
element_head := |*
whitespace | '=';
newline => { advance_line };
# Attribute names.
identifier => { emit(:T_ATTR, ts, te) };
# Attribute values.
dquote => { fcall string_dquote; };
squote => { fcall string_squote; };
# The closing character of the open tag.
('>' | '/') => {
fhold;
fret;
};
*|;
main := |*
element_start => start_element;
doctype_start => start_doctype;
cdata_start => start_cdata;
comment_start => start_comment;
xml_decl_start => start_xml_decl;
# Enter the body of the tag. If HTML mode is enabled and the current
# element is a void element we'll close it and bail out.
'>' => {
if html? and HTML_VOID_ELEMENTS.include?(current_element)
add_token(:T_ELEM_END, nil)
@elements.pop
end
};
# Regular closing tags.
'</' identifier '>' => {
add_token(:T_ELEM_END, nil)
@elements.pop if html?
};
# Self closing elements that are not handled by the HTML mode.
'/>' => {
add_token(:T_ELEM_END, nil)
@elements.pop if html?
};
# Note that this rule should be declared at the very bottom as it
# will otherwise take precedence over the other rules.
^('<' | '>')+ => {
emit_text(ts, te)
};
*|;
}%%
end # Lexer
end # XML
end # Oga

View File

@ -12,6 +12,8 @@ Gem::Specification.new do |s|
s.files = File.read(File.expand_path('../MANIFEST', __FILE__)).split("\n") s.files = File.read(File.expand_path('../MANIFEST', __FILE__)).split("\n")
s.extensions = ['ext/liboga/extconf.rb']
s.has_rdoc = 'yard' s.has_rdoc = 'yard'
s.required_ruby_version = '>= 1.9.3' s.required_ruby_version = '>= 1.9.3'
@ -24,4 +26,5 @@ Gem::Specification.new do |s|
s.add_development_dependency 'simplecov' s.add_development_dependency 'simplecov'
s.add_development_dependency 'kramdown' s.add_development_dependency 'kramdown'
s.add_development_dependency 'benchmark-ips' s.add_development_dependency 'benchmark-ips'
s.add_development_dependency 'rake-compiler'
end end

View File

@ -18,5 +18,11 @@ rule '.rb' => '.rl' do |task|
end end
end end
desc 'Generates the lexer' rule '.c' => '.rl' do |task|
task :lexer => [LEXER_OUTPUT] Cliver.assert('ragel', '~> 6.7')
sh "ragel -C -G2 #{task.source} -o #{task.name}"
end
desc 'Generates the lexers'
task :lexer => ['ext/liboga/lexer.c']

View File

@ -1,4 +1,4 @@
desc 'Runs the tests' desc 'Runs the tests'
task :test => [:generate] do task :test => [:generate, :compile] do
sh 'rspec spec' sh 'rspec spec'
end end