Initial setup using a C extension.
While I've tried to keep Oga pure Ruby for as long as possible the performance of Ragel's Ruby output was not worth the trouble. For example, lexing 10MB of XML would take 5 to 6 seconds at least. Nokogiri on the other hand can parse that same XML into a DOM document in about 300 milliseconds. Such a big performance difference is not acceptable. To work around this the XML/HTML lexer will be implemented in C for MRI/Rubinius and Java for JRuby. For now there's only a C extension as I haven't read up yet on the JRuby API. The end goal is to provide some sort of Ragel "template" that can be used to generate the corresponding C/Java extension code. This would remove the need to duplicate the grammar and associated code. The native extension setup is a hybrid between native and Ruby. The raw Ragel stuff happens in C/Java while the actual logic of actions happens in Ruby. This adds a small amount of overhead but makes it much easier to maintain the lexer. Even with this extra overhead the performance is much better than pure Ruby. The 10MB of XML mentioned above is lexed in about 600 milliseconds. In other words, it's 10 times faster.
This commit is contained in:
parent
baaa24a760
commit
2689d3f65a
|
@ -7,3 +7,6 @@ trim_trailing_whitespace = true
|
|||
|
||||
[*.{y,rb,rl}]
|
||||
indent_size = 2
|
||||
|
||||
[*.{h,h},ext/oga/xml/*.rl]
|
||||
indent_size = 2
|
||||
|
|
|
@ -3,10 +3,14 @@ coverage
|
|||
pkg
|
||||
Gemfile.lock
|
||||
|
||||
lib/oga/xml/lexer.rb
|
||||
lib/oga/xml/parser.rb
|
||||
lib/liboga.*
|
||||
|
||||
benchmark/fixtures/big.xml
|
||||
|
||||
profile/samples/*.txt
|
||||
profile/samples/*/*.txt
|
||||
*.so
|
||||
tmp
|
||||
|
||||
ext/liboga/lexer.c
|
||||
|
|
10
MANIFEST
10
MANIFEST
|
@ -5,6 +5,14 @@ README.md
|
|||
doc/DCO.md
|
||||
doc/changelog.md
|
||||
doc/css/common.css
|
||||
ext/liboga/extconf.rb
|
||||
ext/liboga/lexer.c
|
||||
ext/liboga/lexer.h
|
||||
ext/liboga/lexer.rl
|
||||
ext/liboga/liboga.c
|
||||
ext/liboga/liboga.h
|
||||
ext/liboga/xml.c
|
||||
ext/liboga/xml.h
|
||||
lib/oga.rb
|
||||
lib/oga/html/parser.rb
|
||||
lib/oga/version.rb
|
||||
|
@ -14,10 +22,8 @@ lib/oga/xml/doctype.rb
|
|||
lib/oga/xml/document.rb
|
||||
lib/oga/xml/element.rb
|
||||
lib/oga/xml/lexer.rb
|
||||
lib/oga/xml/lexer.rl
|
||||
lib/oga/xml/node.rb
|
||||
lib/oga/xml/parser.rb
|
||||
lib/oga/xml/parser.y
|
||||
lib/oga/xml/pull_parser.rb
|
||||
lib/oga/xml/text.rb
|
||||
lib/oga/xml/xml_declaration.rb
|
||||
|
|
15
Rakefile
15
Rakefile
|
@ -1,33 +1,38 @@
|
|||
require 'bundler/gem_tasks'
|
||||
require 'digest/sha2'
|
||||
require 'rake/clean'
|
||||
require 'rake/extensiontask'
|
||||
require 'cliver'
|
||||
|
||||
GEMSPEC = Gem::Specification.load('oga.gemspec')
|
||||
|
||||
LEXER_OUTPUT = 'lib/oga/xml/lexer.rb'
|
||||
PARSER_OUTPUT = 'lib/oga/xml/parser.rb'
|
||||
|
||||
CLEAN.include(
|
||||
'coverage',
|
||||
'yardoc',
|
||||
LEXER_OUTPUT,
|
||||
PARSER_OUTPUT,
|
||||
'benchmark/fixtures/big.xml',
|
||||
'profile/samples/**/*.txt'
|
||||
'profile/samples/**/*.txt',
|
||||
'lib/liboga.*',
|
||||
'tmp',
|
||||
'ext/liboga/lexer.c'
|
||||
)
|
||||
|
||||
FILE_LIST = FileList.new(
|
||||
'checkum/**/*.*',
|
||||
'doc/**/*.*',
|
||||
'lib/**/*.*',
|
||||
'lib/**/*.rb',
|
||||
'LICENSE',
|
||||
'MANIFEST',
|
||||
'*.gemspec',
|
||||
'README.md',
|
||||
'.yardopts'
|
||||
'.yardopts',
|
||||
'ext/**/*.*'
|
||||
)
|
||||
|
||||
Rake::ExtensionTask.new('liboga', GEMSPEC)
|
||||
|
||||
Dir['./task/*.rake'].each do |task|
|
||||
import(task)
|
||||
end
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
require 'mkmf'
|
||||
|
||||
have_header('ruby.h')
|
||||
|
||||
$CFLAGS << ' -Wextra -Wall -pedantic'
|
||||
|
||||
if ENV['DEBUG']
|
||||
$CFLAGS << ' -O0'
|
||||
else
|
||||
$CFLAGS << ' -O3 -g'
|
||||
end
|
||||
|
||||
create_makefile('liboga/liboga')
|
|
@ -0,0 +1,10 @@
|
|||
#include "liboga.h"
|
||||
|
||||
#ifndef LIBOGA_XML_LEXER_H
|
||||
#define LIBOGA_XML_LEXER_H
|
||||
|
||||
extern VALUE oga_cLexer;
|
||||
|
||||
extern void Init_liboga_xml_lexer();
|
||||
|
||||
#endif
|
|
@ -0,0 +1,298 @@
|
|||
#include "lexer.h"
|
||||
|
||||
VALUE oga_cLexer;
|
||||
|
||||
%%machine lexer;
|
||||
|
||||
void oga_xml_lexer_callback(
|
||||
VALUE self,
|
||||
const char *name,
|
||||
rb_encoding *encoding,
|
||||
const char *ts,
|
||||
const char *te
|
||||
)
|
||||
{
|
||||
int length = te - ts;
|
||||
VALUE value = rb_enc_str_new_cstr(strndup(ts, length), encoding);
|
||||
VALUE method = rb_intern(name);
|
||||
|
||||
rb_funcall(self, method, 1, value);
|
||||
}
|
||||
|
||||
void oga_xml_lexer_callback_simple(VALUE self, const char *name)
|
||||
{
|
||||
VALUE method = rb_intern(name);
|
||||
|
||||
rb_funcall(self, method, 0);
|
||||
}
|
||||
|
||||
%% write data;
|
||||
|
||||
VALUE oga_xml_lexer_advance(VALUE self)
|
||||
{
|
||||
/* Pull the data in from Ruby land. */
|
||||
VALUE data_ivar = rb_ivar_get(self, rb_intern("@data"));
|
||||
|
||||
/* Make sure that all data passed back to Ruby has the proper encoding. */
|
||||
rb_encoding *encoding = rb_enc_get(data_ivar);
|
||||
|
||||
char *data_str_val = StringValuePtr(data_ivar);
|
||||
|
||||
const char *p = data_str_val;
|
||||
const char *pe = data_str_val + strlen(data_str_val);
|
||||
const char *eof = pe;
|
||||
const char *ts, *te;
|
||||
|
||||
int act = 0;
|
||||
int cs = 0;
|
||||
int top = 0;
|
||||
int stack[8];
|
||||
|
||||
%% write init;
|
||||
%% write exec;
|
||||
|
||||
return Qnil;
|
||||
}
|
||||
|
||||
%%{
|
||||
newline = '\n' | '\r\n';
|
||||
whitespace = [ \t];
|
||||
identifier = [a-zA-Z0-9\-_:]+;
|
||||
|
||||
# Strings
|
||||
#
|
||||
# Strings in HTML can either be single or double quoted. If a string
|
||||
# starts with one of these quotes it must be closed with the same type
|
||||
# of quote.
|
||||
dquote = '"';
|
||||
squote = "'";
|
||||
|
||||
# Machine for processing double quoted strings.
|
||||
string_dquote := |*
|
||||
^dquote+ => {
|
||||
oga_xml_lexer_callback(self, "on_string", encoding, ts, te);
|
||||
};
|
||||
|
||||
dquote => { fret; };
|
||||
*|;
|
||||
|
||||
# Machine for processing single quoted strings.
|
||||
string_squote := |*
|
||||
^squote+ => {
|
||||
oga_xml_lexer_callback(self, "on_string", encoding, ts, te);
|
||||
};
|
||||
|
||||
squote => { fret; };
|
||||
*|;
|
||||
|
||||
# DOCTYPES
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
|
||||
#
|
||||
# These rules support the 3 flavours of doctypes:
|
||||
#
|
||||
# 1. Normal doctypes, as introduced in the HTML5 specification.
|
||||
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
|
||||
# 3. Legacy doctypes
|
||||
#
|
||||
doctype_start = '<!DOCTYPE'i whitespace+;
|
||||
|
||||
action start_doctype {
|
||||
oga_xml_lexer_callback_simple(self, "on_start_doctype");
|
||||
fcall doctype;
|
||||
}
|
||||
|
||||
# Machine for processing doctypes. Doctype values such as the public
|
||||
# and system IDs are treated as T_STRING tokens.
|
||||
doctype := |*
|
||||
'PUBLIC' | 'SYSTEM' => {
|
||||
oga_xml_lexer_callback(self, "on_doctype_type", encoding, ts, te);
|
||||
};
|
||||
|
||||
# Lex the public/system IDs as regular strings.
|
||||
dquote => { fcall string_dquote; };
|
||||
squote => { fcall string_squote; };
|
||||
|
||||
# Whitespace inside doctypes is ignored since there's no point in
|
||||
# including it.
|
||||
whitespace;
|
||||
|
||||
identifier => {
|
||||
oga_xml_lexer_callback(self, "on_doctype_name", encoding, ts, te);
|
||||
};
|
||||
|
||||
'>' => {
|
||||
oga_xml_lexer_callback_simple(self, "on_doctype_end");
|
||||
fret;
|
||||
};
|
||||
*|;
|
||||
|
||||
# CDATA
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
|
||||
#
|
||||
# CDATA tags are broken up into 3 parts: the start, the content and the
|
||||
# end tag.
|
||||
#
|
||||
# In HTML CDATA tags have no meaning/are not supported. Oga does
|
||||
# support them but treats their contents as plain text.
|
||||
#
|
||||
cdata_start = '<![CDATA[';
|
||||
cdata_end = ']]>';
|
||||
|
||||
action start_cdata {
|
||||
oga_xml_lexer_callback_simple(self, "on_cdata_start");
|
||||
fcall cdata;
|
||||
}
|
||||
|
||||
# Machine that for processing the contents of CDATA tags. Everything
|
||||
# inside a CDATA tag is treated as plain text.
|
||||
cdata := |*
|
||||
any* cdata_end => {
|
||||
oga_xml_lexer_callback(self, "on_text", encoding, ts, te - 3);
|
||||
oga_xml_lexer_callback_simple(self, "on_cdata_end");
|
||||
fret;
|
||||
};
|
||||
*|;
|
||||
|
||||
# Comments
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#comments
|
||||
#
|
||||
# Comments are lexed into 3 parts: the start tag, the content and the
|
||||
# end tag.
|
||||
#
|
||||
# Unlike the W3 specification these rules *do* allow character
|
||||
# sequences such as `--` and `->`. Putting extra checks in for these
|
||||
# sequences would actually make the rules/actions more complex.
|
||||
#
|
||||
comment_start = '<!--';
|
||||
comment_end = '-->';
|
||||
|
||||
action start_comment {
|
||||
oga_xml_lexer_callback_simple(self, "on_comment_start");
|
||||
fcall comment;
|
||||
}
|
||||
|
||||
# Machine used for processing the contents of a comment. Everything
|
||||
# inside a comment is treated as plain text (similar to CDATA tags).
|
||||
comment := |*
|
||||
any* comment_end => {
|
||||
oga_xml_lexer_callback(self, "on_text", encoding, ts, te - 3);
|
||||
oga_xml_lexer_callback_simple(self, "on_comment_end");
|
||||
fret;
|
||||
};
|
||||
*|;
|
||||
|
||||
# XML declaration tags
|
||||
#
|
||||
# http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
|
||||
#
|
||||
xml_decl_start = '<?xml';
|
||||
xml_decl_end = '?>';
|
||||
|
||||
action start_xml_decl {
|
||||
oga_xml_lexer_callback_simple(self, "on_xml_decl_start");
|
||||
fcall xml_decl;
|
||||
}
|
||||
|
||||
# Machine that processes the contents of an XML declaration tag.
|
||||
xml_decl := |*
|
||||
xml_decl_end => {
|
||||
oga_xml_lexer_callback_simple(self, "on_xml_decl_end");
|
||||
fret;
|
||||
};
|
||||
|
||||
# Attributes and their values (e.g. version="1.0").
|
||||
identifier => {
|
||||
oga_xml_lexer_callback(self, "on_attribute", encoding, ts, te);
|
||||
};
|
||||
|
||||
dquote => { fcall string_dquote; };
|
||||
squote => { fcall string_squote; };
|
||||
|
||||
any;
|
||||
*|;
|
||||
|
||||
# Elements
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
|
||||
#
|
||||
|
||||
# Action that creates the tokens for the opening tag, name and
|
||||
# namespace (if any). Remaining work is delegated to a dedicated
|
||||
# machine.
|
||||
action start_element {
|
||||
oga_xml_lexer_callback(self, "on_element_start", encoding, ts + 1, te);
|
||||
|
||||
fcall element_head;
|
||||
}
|
||||
|
||||
element_start = '<' identifier;
|
||||
|
||||
# Machine used for processing the characters inside a element head. An
|
||||
# element head is everything between `<NAME` (where NAME is the element
|
||||
# name) and `>`.
|
||||
#
|
||||
# For example, in `<p foo="bar">` the element head is ` foo="bar"`.
|
||||
#
|
||||
element_head := |*
|
||||
whitespace | '=';
|
||||
|
||||
newline => {
|
||||
oga_xml_lexer_callback_simple(self, "on_newline");
|
||||
};
|
||||
|
||||
# Attribute names.
|
||||
identifier => {
|
||||
oga_xml_lexer_callback(self, "on_attribute", encoding, ts, te);
|
||||
};
|
||||
|
||||
# Attribute values.
|
||||
dquote => { fcall string_dquote; };
|
||||
squote => { fcall string_squote; };
|
||||
|
||||
# The closing character of the open tag.
|
||||
('>' | '/') => {
|
||||
fhold;
|
||||
fret;
|
||||
};
|
||||
*|;
|
||||
|
||||
main := |*
|
||||
element_start => start_element;
|
||||
doctype_start => start_doctype;
|
||||
cdata_start => start_cdata;
|
||||
comment_start => start_comment;
|
||||
xml_decl_start => start_xml_decl;
|
||||
|
||||
# Enter the body of the tag. If HTML mode is enabled and the current
|
||||
# element is a void element we'll close it and bail out.
|
||||
'>' => {
|
||||
oga_xml_lexer_callback_simple(self, "on_element_open_end");
|
||||
};
|
||||
|
||||
# Regular closing tags.
|
||||
'</' identifier '>' => {
|
||||
oga_xml_lexer_callback_simple(self, "on_element_end");
|
||||
};
|
||||
|
||||
# Self closing elements that are not handled by the HTML mode.
|
||||
'/>' => {
|
||||
oga_xml_lexer_callback_simple(self, "on_element_end");
|
||||
};
|
||||
|
||||
# Note that this rule should be declared at the very bottom as it
|
||||
# will otherwise take precedence over the other rules.
|
||||
^('<' | '>')+ => {
|
||||
oga_xml_lexer_callback(self, "on_text", encoding, ts, te);
|
||||
};
|
||||
*|;
|
||||
}%%
|
||||
|
||||
void Init_liboga_xml_lexer()
|
||||
{
|
||||
oga_cLexer = rb_define_class_under(oga_mXML, "Lexer", rb_cObject);
|
||||
|
||||
rb_define_method(oga_cLexer, "advance_native", oga_xml_lexer_advance, 0);
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
#include "liboga.h"
|
||||
|
||||
VALUE oga_mOga;
|
||||
|
||||
void Init_liboga()
|
||||
{
|
||||
oga_mOga = rb_define_module("Oga");
|
||||
|
||||
Init_liboga_xml();
|
||||
Init_liboga_xml_lexer();
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
#ifndef LIBOGA_H
|
||||
#define LIBOGA_H
|
||||
|
||||
#include <ruby.h>
|
||||
#include <ruby/encoding.h>
|
||||
#include <string.h>
|
||||
#include <malloc.h>
|
||||
#include <stdio.h>
|
||||
|
||||
extern VALUE oga_mOga;
|
||||
|
||||
#include "xml.h"
|
||||
#include "lexer.h"
|
||||
|
||||
void Init_liboga();
|
||||
|
||||
#endif
|
|
@ -0,0 +1,8 @@
|
|||
#include "xml.h"
|
||||
|
||||
VALUE oga_mXML;
|
||||
|
||||
void Init_liboga_xml()
|
||||
{
|
||||
oga_mXML = rb_define_module_under(oga_mOga, "XML");
|
||||
}
|
|
@ -0,0 +1,10 @@
|
|||
#ifndef LIBOGA_XML_H
|
||||
#define LIBOGA_XML_H
|
||||
|
||||
#include "liboga.h"
|
||||
|
||||
extern VALUE oga_mXML;
|
||||
|
||||
void Init_liboga_xml();
|
||||
|
||||
#endif
|
|
@ -1,5 +1,7 @@
|
|||
require 'set'
|
||||
|
||||
require_relative 'liboga'
|
||||
|
||||
require_relative 'oga/xml/lexer'
|
||||
require_relative 'oga/xml/parser'
|
||||
require_relative 'oga/xml/pull_parser'
|
||||
|
|
|
@ -0,0 +1,249 @@
|
|||
module Oga
|
||||
module XML
|
||||
##
|
||||
# Low level lexer that supports both XML and HTML (using an extra option).
|
||||
# To lex HTML input set the `:html` option to `true` when creating an
|
||||
# instance of the lexer:
|
||||
#
|
||||
# lexer = Oga::XML::Lexer.new(:html => true)
|
||||
#
|
||||
# @!attribute [r] html
|
||||
# @return [TrueClass|FalseClass]
|
||||
#
|
||||
# @!attribute [r] tokens
|
||||
# @return [Array]
|
||||
#
|
||||
class Lexer
|
||||
attr_reader :html
|
||||
|
||||
##
|
||||
# Names of the HTML void elements that should be handled when HTML lexing
|
||||
# is enabled.
|
||||
#
|
||||
# @return [Set]
|
||||
#
|
||||
HTML_VOID_ELEMENTS = Set.new([
|
||||
'area',
|
||||
'base',
|
||||
'br',
|
||||
'col',
|
||||
'command',
|
||||
'embed',
|
||||
'hr',
|
||||
'img',
|
||||
'input',
|
||||
'keygen',
|
||||
'link',
|
||||
'meta',
|
||||
'param',
|
||||
'source',
|
||||
'track',
|
||||
'wbr'
|
||||
])
|
||||
|
||||
##
|
||||
# @param [String] data The data to lex.
|
||||
#
|
||||
# @param [Hash] options
|
||||
#
|
||||
# @option options [Symbol] :html When set to `true` the lexer will treat
|
||||
# the input as HTML instead of SGML/XML. This makes it possible to lex
|
||||
# HTML void elements such as `<link href="">`.
|
||||
#
|
||||
def initialize(data, options = {})
|
||||
options.each do |key, value|
|
||||
instance_variable_set("@#{key}", value) if respond_to?(key)
|
||||
end
|
||||
|
||||
@data = data
|
||||
|
||||
reset
|
||||
end
|
||||
|
||||
##
|
||||
# Resets the internal state of the lexer. Typically you don't need to
|
||||
# call this method yourself as its called by #lex after lexing a given
|
||||
# String.
|
||||
#
|
||||
def reset
|
||||
@line = 1
|
||||
@elements = []
|
||||
end
|
||||
|
||||
##
|
||||
# Gathers all the tokens for the input and returns them as an Array.
|
||||
#
|
||||
# This method resets the internal state of the lexer after consuming the
|
||||
# input.
|
||||
#
|
||||
# @param [String] data The string to consume.
|
||||
# @return [Array]
|
||||
# @see #advance
|
||||
#
|
||||
def lex
|
||||
tokens = []
|
||||
|
||||
advance do |token|
|
||||
tokens << token
|
||||
end
|
||||
|
||||
reset
|
||||
|
||||
return tokens
|
||||
end
|
||||
|
||||
##
|
||||
# Advances through the input and generates the corresponding tokens. Each
|
||||
# token is yielded to the supplied block.
|
||||
#
|
||||
# Each token is an Array in the following format:
|
||||
#
|
||||
# [TYPE, VALUE]
|
||||
#
|
||||
# The type is a symbol, the value is either nil or a String.
|
||||
#
|
||||
# This method stores the supplied block in `@block` and resets it after
|
||||
# the lexer loop has finished.
|
||||
#
|
||||
# This method does *not* reset the internal state of the lexer.
|
||||
#
|
||||
#
|
||||
# @param [String] data The String to consume.
|
||||
# @return [Array]
|
||||
#
|
||||
def advance(&block)
|
||||
@block = block
|
||||
|
||||
advance_native
|
||||
ensure
|
||||
@block = nil
|
||||
end
|
||||
|
||||
##
|
||||
# @return [TrueClass|FalseClass]
|
||||
#
|
||||
def html?
|
||||
return !!html
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
##
|
||||
# @param [Fixnum] amount The amount of lines to advance.
|
||||
#
|
||||
def advance_line(amount = 1)
|
||||
@line += amount
|
||||
end
|
||||
|
||||
##
|
||||
# Adds a token with the given type and value to the list.
|
||||
#
|
||||
# @param [Symbol] type The token type.
|
||||
# @param [String] value The token value.
|
||||
#
|
||||
def add_token(type, value = nil)
|
||||
token = [type, value, @line]
|
||||
|
||||
@block.call(token)
|
||||
end
|
||||
|
||||
##
|
||||
# Returns the name of the element we're currently in.
|
||||
#
|
||||
# @return [String]
|
||||
#
|
||||
def current_element
|
||||
return @elements.last
|
||||
end
|
||||
|
||||
def on_string(value)
|
||||
add_token(:T_STRING, value)
|
||||
end
|
||||
|
||||
def on_start_doctype
|
||||
add_token(:T_DOCTYPE_START)
|
||||
end
|
||||
|
||||
def on_doctype_type(value)
|
||||
add_token(:T_DOCTYPE_TYPE, value)
|
||||
end
|
||||
|
||||
def on_doctype_name(value)
|
||||
add_token(:T_DOCTYPE_NAME, value)
|
||||
end
|
||||
|
||||
def on_doctype_end
|
||||
add_token(:T_DOCTYPE_END)
|
||||
end
|
||||
|
||||
def on_cdata_start
|
||||
add_token(:T_CDATA_START)
|
||||
end
|
||||
|
||||
def on_cdata_end
|
||||
add_token(:T_CDATA_END)
|
||||
end
|
||||
|
||||
def on_comment_start
|
||||
add_token(:T_COMMENT_START)
|
||||
end
|
||||
|
||||
def on_comment_end
|
||||
add_token(:T_COMMENT_END)
|
||||
end
|
||||
|
||||
def on_xml_decl_start
|
||||
add_token(:T_XML_DECL_START)
|
||||
end
|
||||
|
||||
def on_xml_decl_end
|
||||
add_token(:T_XML_DECL_END)
|
||||
end
|
||||
|
||||
def on_element_start(name)
|
||||
add_token(:T_ELEM_START)
|
||||
|
||||
if name.include?(':')
|
||||
ns, name = name.split(':')
|
||||
|
||||
add_token(:T_ELEM_NS, ns)
|
||||
end
|
||||
|
||||
@elements << name if html?
|
||||
|
||||
add_token(:T_ELEM_NAME, name)
|
||||
end
|
||||
|
||||
def on_element_open_end
|
||||
if html? and HTML_VOID_ELEMENTS.include?(current_element)
|
||||
add_token(:T_ELEM_END)
|
||||
@elements.pop
|
||||
end
|
||||
end
|
||||
|
||||
def on_element_end
|
||||
add_token(:T_ELEM_END)
|
||||
|
||||
@elements.pop if html?
|
||||
end
|
||||
|
||||
def on_text(value)
|
||||
unless value.empty?
|
||||
add_token(:T_TEXT, value)
|
||||
|
||||
lines = value.count("\n")
|
||||
|
||||
advance_line(lines) if lines > 0
|
||||
end
|
||||
end
|
||||
|
||||
def on_attribute(value)
|
||||
add_token(:T_ATTR, value)
|
||||
end
|
||||
|
||||
def on_newline
|
||||
@line += 1
|
||||
end
|
||||
end # Lexer
|
||||
end # XML
|
||||
end # Oga
|
|
@ -1,501 +0,0 @@
|
|||
%%machine lexer; # %
|
||||
|
||||
module Oga
|
||||
module XML
|
||||
##
|
||||
# Low level lexer that supports both XML and HTML (using an extra option).
|
||||
# To lex HTML input set the `:html` option to `true` when creating an
|
||||
# instance of the lexer:
|
||||
#
|
||||
# lexer = Oga::XML::Lexer.new(:html => true)
|
||||
#
|
||||
# @!attribute [r] html
|
||||
# @return [TrueClass|FalseClass]
|
||||
#
|
||||
# @!attribute [r] tokens
|
||||
# @return [Array]
|
||||
#
|
||||
class Lexer
|
||||
%% write data;
|
||||
|
||||
# % fix highlight
|
||||
|
||||
attr_reader :html
|
||||
|
||||
##
|
||||
# Names of the HTML void elements that should be handled when HTML lexing
|
||||
# is enabled.
|
||||
#
|
||||
# @return [Set]
|
||||
#
|
||||
HTML_VOID_ELEMENTS = Set.new([
|
||||
'area',
|
||||
'base',
|
||||
'br',
|
||||
'col',
|
||||
'command',
|
||||
'embed',
|
||||
'hr',
|
||||
'img',
|
||||
'input',
|
||||
'keygen',
|
||||
'link',
|
||||
'meta',
|
||||
'param',
|
||||
'source',
|
||||
'track',
|
||||
'wbr'
|
||||
])
|
||||
|
||||
##
|
||||
# @param [String] data The data to lex.
|
||||
#
|
||||
# @param [Hash] options
|
||||
#
|
||||
# @option options [Symbol] :html When set to `true` the lexer will treat
|
||||
# the input as HTML instead of SGML/XML. This makes it possible to lex
|
||||
# HTML void elements such as `<link href="">`.
|
||||
#
|
||||
def initialize(data, options = {})
|
||||
options.each do |key, value|
|
||||
instance_variable_set("@#{key}", value) if respond_to?(key)
|
||||
end
|
||||
|
||||
@data = data
|
||||
|
||||
reset
|
||||
end
|
||||
|
||||
##
|
||||
# Resets the internal state of the lexer. Typically you don't need to
|
||||
# call this method yourself as its called by #lex after lexing a given
|
||||
# String.
|
||||
#
|
||||
def reset
|
||||
@line = 1
|
||||
@elements = []
|
||||
|
||||
@buffer_start_position = nil
|
||||
end
|
||||
|
||||
##
|
||||
# Gathers all the tokens for the input and returns them as an Array.
|
||||
#
|
||||
# This method resets the internal state of the lexer after consuming the
|
||||
# input.
|
||||
#
|
||||
# @param [String] data The string to consume.
|
||||
# @return [Array]
|
||||
# @see #advance
|
||||
#
|
||||
def lex
|
||||
tokens = []
|
||||
|
||||
advance do |token|
|
||||
tokens << token
|
||||
end
|
||||
|
||||
reset
|
||||
|
||||
return tokens
|
||||
end
|
||||
|
||||
##
|
||||
# Advances through the input and generates the corresponding tokens. Each
|
||||
# token is yielded to the supplied block.
|
||||
#
|
||||
# Each token is an Array in the following format:
|
||||
#
|
||||
# [TYPE, VALUE]
|
||||
#
|
||||
# The type is a symbol, the value is either nil or a String.
|
||||
#
|
||||
# This method stores the supplied block in `@block` and resets it after
|
||||
# the lexer loop has finished.
|
||||
#
|
||||
# This method does *not* reset the internal state of the lexer.
|
||||
#
|
||||
#
|
||||
# @param [String] data The String to consume.
|
||||
# @return [Array]
|
||||
#
|
||||
def advance(&block)
|
||||
@block = block
|
||||
|
||||
data = @data
|
||||
ts = nil
|
||||
te = nil
|
||||
stack = []
|
||||
top = 0
|
||||
cs = self.class.lexer_start
|
||||
act = 0
|
||||
eof = @data.bytesize
|
||||
p = 0
|
||||
pe = eof
|
||||
|
||||
_lexer_eof_trans = self.class.send(:_lexer_eof_trans)
|
||||
_lexer_from_state_actions = self.class.send(:_lexer_from_state_actions)
|
||||
_lexer_index_offsets = self.class.send(:_lexer_index_offsets)
|
||||
_lexer_indicies = self.class.send(:_lexer_indicies)
|
||||
_lexer_key_spans = self.class.send(:_lexer_key_spans)
|
||||
_lexer_to_state_actions = self.class.send(:_lexer_to_state_actions)
|
||||
_lexer_trans_actions = self.class.send(:_lexer_trans_actions)
|
||||
_lexer_trans_keys = self.class.send(:_lexer_trans_keys)
|
||||
_lexer_trans_targs = self.class.send(:_lexer_trans_targs)
|
||||
|
||||
%% write exec;
|
||||
|
||||
# % fix highlight
|
||||
ensure
|
||||
@block = nil
|
||||
end
|
||||
|
||||
##
|
||||
# @return [TrueClass|FalseClass]
|
||||
#
|
||||
def html?
|
||||
return !!html
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
##
|
||||
# @param [Fixnum] amount The amount of lines to advance.
|
||||
#
|
||||
def advance_line(amount = 1)
|
||||
@line += amount
|
||||
end
|
||||
|
||||
##
|
||||
# Emits a token who's value is based on the supplied start/stop position.
|
||||
#
|
||||
# @param [Symbol] type The token type.
|
||||
# @param [Fixnum] start
|
||||
# @param [Fixnum] stop
|
||||
#
|
||||
# @see #text
|
||||
# @see #add_token
|
||||
#
|
||||
def emit(type, start, stop)
|
||||
value = text(start, stop)
|
||||
|
||||
add_token(type, value)
|
||||
end
|
||||
|
||||
##
|
||||
# Returns the text of the current buffer based on the supplied start and
|
||||
# stop position.
|
||||
#
|
||||
# @param [Fixnum] start
|
||||
# @param [Fixnum] stop
|
||||
# @return [String]
|
||||
#
|
||||
def text(start, stop)
|
||||
return @data.byteslice(start, stop - start)
|
||||
end
|
||||
|
||||
##
|
||||
# Adds a token with the given type and value to the list.
|
||||
#
|
||||
# @param [Symbol] type The token type.
|
||||
# @param [String] value The token value.
|
||||
#
|
||||
def add_token(type, value = nil)
|
||||
token = [type, value, @line]
|
||||
|
||||
@block.call(token)
|
||||
end
|
||||
|
||||
##
|
||||
# Enables buffering starting at the given position.
|
||||
#
|
||||
# @param [Fixnum] position The start position of the buffer.
|
||||
#
|
||||
def start_buffer(position)
|
||||
@buffer_start_position = position
|
||||
end
|
||||
|
||||
##
|
||||
# Emits a text token.
|
||||
#
|
||||
# @param [Fixnum] start
|
||||
# @param [Fixnum] stop
|
||||
#
|
||||
def emit_text(start, stop)
|
||||
content = text(start, stop)
|
||||
|
||||
unless content.empty?
|
||||
add_token(:T_TEXT, content)
|
||||
|
||||
lines = content.count("\n")
|
||||
|
||||
advance_line(lines) if lines > 0
|
||||
end
|
||||
end
|
||||
|
||||
##
|
||||
# Returns the name of the element we're currently in.
|
||||
#
|
||||
# @return [String]
|
||||
#
|
||||
def current_element
|
||||
return @elements.last
|
||||
end
|
||||
|
||||
%%{
|
||||
getkey (data.getbyte(p) || 0);
|
||||
|
||||
newline = '\n' | '\r\n';
|
||||
whitespace = [ \t];
|
||||
identifier = [a-zA-Z0-9\-_:]+;
|
||||
|
||||
# Strings
|
||||
#
|
||||
# Strings in HTML can either be single or double quoted. If a string
|
||||
# starts with one of these quotes it must be closed with the same type
|
||||
# of quote.
|
||||
dquote = '"';
|
||||
squote = "'";
|
||||
|
||||
# Machine for processing double quoted strings.
|
||||
string_dquote := |*
|
||||
^dquote+ => {
|
||||
emit(:T_STRING, ts, te)
|
||||
};
|
||||
|
||||
dquote => { fret; };
|
||||
*|;
|
||||
|
||||
# Machine for processing single quoted strings.
|
||||
string_squote := |*
|
||||
^squote+ => {
|
||||
emit(:T_STRING, ts, te)
|
||||
};
|
||||
|
||||
squote => { fret; };
|
||||
*|;
|
||||
|
||||
# DOCTYPES
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
|
||||
#
|
||||
# These rules support the 3 flavours of doctypes:
|
||||
#
|
||||
# 1. Normal doctypes, as introduced in the HTML5 specification.
|
||||
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
|
||||
# 3. Legacy doctypes
|
||||
#
|
||||
doctype_start = '<!DOCTYPE'i whitespace+;
|
||||
|
||||
action start_doctype {
|
||||
add_token(:T_DOCTYPE_START)
|
||||
fcall doctype;
|
||||
}
|
||||
|
||||
# Machine for processing doctypes. Doctype values such as the public
|
||||
# and system IDs are treated as T_STRING tokens.
|
||||
doctype := |*
|
||||
'PUBLIC' | 'SYSTEM' => { emit(:T_DOCTYPE_TYPE, ts, te) };
|
||||
|
||||
# Lex the public/system IDs as regular strings.
|
||||
dquote => { fcall string_dquote; };
|
||||
squote => { fcall string_squote; };
|
||||
|
||||
# Whitespace inside doctypes is ignored since there's no point in
|
||||
# including it.
|
||||
whitespace;
|
||||
|
||||
identifier => { emit(:T_DOCTYPE_NAME, ts, te) };
|
||||
|
||||
'>' => {
|
||||
add_token(:T_DOCTYPE_END)
|
||||
fret;
|
||||
};
|
||||
*|;
|
||||
|
||||
# CDATA
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
|
||||
#
|
||||
# CDATA tags are broken up into 3 parts: the start, the content and the
|
||||
# end tag.
|
||||
#
|
||||
# In HTML CDATA tags have no meaning/are not supported. Oga does
|
||||
# support them but treats their contents as plain text.
|
||||
#
|
||||
cdata_start = '<![CDATA[';
|
||||
cdata_end = ']]>';
|
||||
|
||||
action start_cdata {
|
||||
add_token(:T_CDATA_START)
|
||||
|
||||
fcall cdata;
|
||||
}
|
||||
|
||||
# Machine that for processing the contents of CDATA tags. Everything
|
||||
# inside a CDATA tag is treated as plain text.
|
||||
cdata := |*
|
||||
any* cdata_end => {
|
||||
emit_text(ts, te - 3)
|
||||
add_token(:T_CDATA_END)
|
||||
|
||||
fret;
|
||||
};
|
||||
*|;
|
||||
|
||||
# Comments
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#comments
|
||||
#
|
||||
# Comments are lexed into 3 parts: the start tag, the content and the
|
||||
# end tag.
|
||||
#
|
||||
# Unlike the W3 specification these rules *do* allow character
|
||||
# sequences such as `--` and `->`. Putting extra checks in for these
|
||||
# sequences would actually make the rules/actions more complex.
|
||||
#
|
||||
comment_start = '<!--';
|
||||
comment_end = '-->';
|
||||
|
||||
action start_comment {
|
||||
add_token(:T_COMMENT_START)
|
||||
|
||||
fcall comment;
|
||||
}
|
||||
|
||||
# Machine used for processing the contents of a comment. Everything
|
||||
# inside a comment is treated as plain text (similar to CDATA tags).
|
||||
comment := |*
|
||||
any* comment_end => {
|
||||
emit_text(ts, te - 3)
|
||||
add_token(:T_COMMENT_END)
|
||||
|
||||
fret;
|
||||
};
|
||||
*|;
|
||||
|
||||
# XML declaration tags
|
||||
#
|
||||
# http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
|
||||
#
|
||||
xml_decl_start = '<?xml';
|
||||
xml_decl_end = '?>';
|
||||
|
||||
action start_xml_decl {
|
||||
add_token(:T_XML_DECL_START)
|
||||
|
||||
fcall xml_decl;
|
||||
}
|
||||
|
||||
# Machine that processes the contents of an XML declaration tag.
|
||||
xml_decl := |*
|
||||
xml_decl_end => {
|
||||
add_token(:T_XML_DECL_END)
|
||||
|
||||
fret;
|
||||
};
|
||||
|
||||
# Attributes and their values (e.g. version="1.0").
|
||||
identifier => { emit(:T_ATTR, ts, te) };
|
||||
|
||||
dquote => { fcall string_dquote; };
|
||||
squote => { fcall string_squote; };
|
||||
|
||||
any;
|
||||
*|;
|
||||
|
||||
# Elements
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
|
||||
#
|
||||
|
||||
# Action that creates the tokens for the opening tag, name and
|
||||
# namespace (if any). Remaining work is delegated to a dedicated
|
||||
# machine.
|
||||
action start_element {
|
||||
add_token(:T_ELEM_START)
|
||||
|
||||
# Add the element name. If the name includes a namespace we'll break
|
||||
# the name up into two separate tokens.
|
||||
name = text(ts + 1, te)
|
||||
|
||||
if name.include?(':')
|
||||
ns, name = name.split(':')
|
||||
|
||||
add_token(:T_ELEM_NS, ns)
|
||||
end
|
||||
|
||||
@elements << name if html?
|
||||
|
||||
add_token(:T_ELEM_NAME, name)
|
||||
|
||||
fcall element_head;
|
||||
}
|
||||
|
||||
element_start = '<' identifier;
|
||||
|
||||
# Machine used for processing the characters inside a element head. An
|
||||
# element head is everything between `<NAME` (where NAME is the element
|
||||
# name) and `>`.
|
||||
#
|
||||
# For example, in `<p foo="bar">` the element head is ` foo="bar"`.
|
||||
#
|
||||
element_head := |*
|
||||
whitespace | '=';
|
||||
|
||||
newline => { advance_line };
|
||||
|
||||
# Attribute names.
|
||||
identifier => { emit(:T_ATTR, ts, te) };
|
||||
|
||||
# Attribute values.
|
||||
dquote => { fcall string_dquote; };
|
||||
squote => { fcall string_squote; };
|
||||
|
||||
# The closing character of the open tag.
|
||||
('>' | '/') => {
|
||||
fhold;
|
||||
fret;
|
||||
};
|
||||
*|;
|
||||
|
||||
main := |*
|
||||
element_start => start_element;
|
||||
doctype_start => start_doctype;
|
||||
cdata_start => start_cdata;
|
||||
comment_start => start_comment;
|
||||
xml_decl_start => start_xml_decl;
|
||||
|
||||
# Enter the body of the tag. If HTML mode is enabled and the current
|
||||
# element is a void element we'll close it and bail out.
|
||||
'>' => {
|
||||
if html? and HTML_VOID_ELEMENTS.include?(current_element)
|
||||
add_token(:T_ELEM_END, nil)
|
||||
@elements.pop
|
||||
end
|
||||
};
|
||||
|
||||
# Regular closing tags.
|
||||
'</' identifier '>' => {
|
||||
add_token(:T_ELEM_END, nil)
|
||||
|
||||
@elements.pop if html?
|
||||
};
|
||||
|
||||
# Self closing elements that are not handled by the HTML mode.
|
||||
'/>' => {
|
||||
add_token(:T_ELEM_END, nil)
|
||||
|
||||
@elements.pop if html?
|
||||
};
|
||||
|
||||
# Note that this rule should be declared at the very bottom as it
|
||||
# will otherwise take precedence over the other rules.
|
||||
^('<' | '>')+ => {
|
||||
emit_text(ts, te)
|
||||
};
|
||||
*|;
|
||||
}%%
|
||||
end # Lexer
|
||||
end # XML
|
||||
end # Oga
|
|
@ -12,6 +12,8 @@ Gem::Specification.new do |s|
|
|||
|
||||
s.files = File.read(File.expand_path('../MANIFEST', __FILE__)).split("\n")
|
||||
|
||||
s.extensions = ['ext/liboga/extconf.rb']
|
||||
|
||||
s.has_rdoc = 'yard'
|
||||
s.required_ruby_version = '>= 1.9.3'
|
||||
|
||||
|
@ -24,4 +26,5 @@ Gem::Specification.new do |s|
|
|||
s.add_development_dependency 'simplecov'
|
||||
s.add_development_dependency 'kramdown'
|
||||
s.add_development_dependency 'benchmark-ips'
|
||||
s.add_development_dependency 'rake-compiler'
|
||||
end
|
||||
|
|
|
@ -18,5 +18,11 @@ rule '.rb' => '.rl' do |task|
|
|||
end
|
||||
end
|
||||
|
||||
desc 'Generates the lexer'
|
||||
task :lexer => [LEXER_OUTPUT]
|
||||
rule '.c' => '.rl' do |task|
|
||||
Cliver.assert('ragel', '~> 6.7')
|
||||
|
||||
sh "ragel -C -G2 #{task.source} -o #{task.name}"
|
||||
end
|
||||
|
||||
desc 'Generates the lexers'
|
||||
task :lexer => ['ext/liboga/lexer.c']
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
desc 'Runs the tests'
|
||||
task :test => [:generate] do
|
||||
task :test => [:generate, :compile] do
|
||||
sh 'rspec spec'
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue