Namespaced the lexer/parser under Oga::XML.

With the upcoming XPath and CSS selector lexers/parsers it will be confusing to
keep these in the root namespace.
This commit is contained in:
Yorick Peterse 2014-03-25 09:34:38 +01:00
parent 2259061c89
commit eae13d21ed
28 changed files with 2049 additions and 537 deletions

View File

@ -5,10 +5,10 @@ require 'cliver'
GEMSPEC = Gem::Specification.load('oga.gemspec')
LEXER_INPUT = 'lib/oga/lexer.rl'
LEXER_OUTPUT = 'lib/oga/lexer.rb'
LEXER_INPUT = 'lib/oga/xml/lexer.rl'
LEXER_OUTPUT = 'lib/oga/xml/lexer.rb'
HTML_PARSER = 'lib/oga/parser.rb'
HTML_PARSER = 'lib/oga/xml/parser.rb'
GENERATED_FILES = ['coverage', 'yardoc', LEXER_OUTPUT, HTML_PARSER]

View File

@ -5,7 +5,7 @@ string = 'Hello, how are you doing today?'
small = "<![CDATA[#{string}]]>"
medium = "<![CDATA[#{string * 1_000}]]>"
large = "<![CDATA[#{string * 10_000}]]>"
lexer = Oga::Lexer.new
lexer = Oga::XML::Lexer.new
Benchmark.ips do |bench|
bench.report 'CDATA with a small body' do

View File

@ -4,7 +4,7 @@ require 'benchmark/ips'
simple = '<p>Hello world</p>'
attributes = '<p class="foo">Hello world</p>'
nested = '<p>Hello<strong>world</strong></p>'
lexer = Oga::Lexer.new
lexer = Oga::XML::Lexer.new
Benchmark.ips do |bench|
bench.report 'text only' do

View File

@ -2,7 +2,7 @@ require_relative '../../lib/oga'
require 'benchmark/ips'
html = File.read(File.expand_path('../../fixtures/hrs.html', __FILE__))
lexer = Oga::Lexer.new(:html => true)
lexer = Oga::XML::Lexer.new(:html => true)
Benchmark.ips do |bench|
bench.report 'lex HTML' do

View File

@ -2,7 +2,7 @@ require_relative '../../lib/oga'
require 'benchmark'
html = File.read(File.expand_path('../../fixtures/hrs.html', __FILE__))
lexer = Oga::Lexer.new(:html => true)
lexer = Oga::XML::Lexer.new(:html => true)
Benchmark.bmbm(20) do |bench|
bench.report 'lex HTML' do

View File

@ -1,5 +1,5 @@
require 'ast'
require_relative 'oga/ast/node'
require_relative 'oga/lexer'
require_relative 'oga/parser'
require_relative 'oga/xml/lexer'
require_relative 'oga/xml/parser'

View File

@ -1,508 +0,0 @@
%%machine lexer; # %
module Oga
##
# Low level lexer that supports both XML and HTML (using an extra option). To
# lex HTML input set the `:html` option to `true` when creating an instance
# of the lexer:
#
# lexer = Oga::Lexer.new(:html => true)
#
# @!attribute [r] html
# @return [TrueClass|FalseClass]
#
class Lexer
%% write data; # %
attr_reader :html
##
# Names of the HTML void elements that should be handled when HTML lexing
# is enabled.
#
# @return [Array]
#
HTML_VOID_ELEMENTS = [
'area',
'base',
'br',
'col',
'command',
'embed',
'hr',
'img',
'input',
'keygen',
'link',
'meta',
'param',
'source',
'track',
'wbr'
]
# Lazy way of forwarding instance method calls used internally by Ragel to
# their corresponding class methods.
private_methods.grep(/^_lexer_/).each do |name|
define_method(name) do
return self.class.send(name)
end
private(name)
end
##
# @param [Hash] options
#
# @option options [Symbol] :html When set to `true` the lexer will treat
# the input as HTML instead of SGML/XML. This makes it possible to lex
# HTML void elements such as `<link href="">`.
#
def initialize(options = {})
options.each do |key, value|
instance_variable_set("@#{key}", value) if respond_to?(key)
end
reset
end
##
# Resets the internal state of the lexer. Typically you don't need to call
# this method yourself as its called by #lex after lexing a given String.
#
def reset
@line = 1
@data = nil
@ts = nil
@te = nil
@tokens = []
@stack = []
@top = 0
@elements = []
@buffer_start_position = nil
end
##
# Lexes the supplied String and returns an Array of tokens. Each token is
# an Array in the following format:
#
# [TYPE, VALUE]
#
# The type is a symbol, the value is either nil or a String.
#
# @param [String] data The string to lex.
# @return [Array]
#
def lex(data)
@data = data.unpack('U*')
lexer_start = self.class.lexer_start
eof = data.length
%% write init;
%% write exec;
tokens = @tokens
reset
return tokens
end
##
# @return [TrueClass|FalseClass]
#
def html?
return !!html
end
private
##
# @param [Fixnum] amount The amount of lines to advance.
#
def advance_line(amount = 1)
@line += amount
end
##
# Emits a token who's value is based on the supplied start/stop position.
#
# @param [Symbol] type The token type.
# @param [Fixnum] start
# @param [Fixnum] stop
#
# @see #text
# @see #add_token
#
def t(type, start = @ts, stop = @te)
value = text(start, stop)
add_token(type, value)
end
##
# Returns the text of the current buffer based on the supplied start and
# stop position.
#
# By default `@ts` and `@te` are used as the start/stop position.
#
# @param [Fixnum] start
# @param [Fixnum] stop
# @return [String]
#
def text(start = @ts, stop = @te)
return @data[start...stop].pack('U*')
end
##
# Adds a token with the given type and value to the list.
#
# @param [Symbol] type The token type.
# @param [String] value The token value.
#
def add_token(type, value = nil)
token = [type, value, @line]
@tokens << token
end
##
# Enables buffering starting at the given position.
#
# @param [Fixnum] position The start position of the buffer, set to `@te`
# by default.
#
def start_buffer(position = @te)
@buffer_start_position = position
end
##
# Returns `true` if we're currently buffering.
#
# @return [TrueClass|FalseClass]
#
def buffering?
return !!@buffer_start_position
end
##
# Emits the current buffer if we have any. The current line number is
# advanced based on the amount of newlines in the buffer.
#
# @param [Fixnum] position The end position of the buffer, set to `@ts` by
# default.
#
# @param [Symbol] type The type of node to emit.
#
def emit_buffer(position = @ts, type = :T_TEXT)
return unless @buffer_start_position
content = text(@buffer_start_position, position)
unless content.empty?
add_token(type, content)
lines = content.count("\n")
advance_line(lines) if lines > 0
end
@buffer_start_position = nil
end
##
# Returns the name of the element we're currently in.
#
# @return [String]
#
def current_element
return @elements.last
end
%%{
# Use instance variables for `ts` and friends.
access @;
getkey (@data[p] || 0);
newline = '\n' | '\r\n';
whitespace = [ \t];
# Strings
#
# Strings in HTML can either be single or double quoted. If a string
# starts with one of these quotes it must be closed with the same type of
# quote.
dquote = '"';
squote = "'";
action start_string_dquote {
start_buffer
fcall string_dquote;
}
action start_string_squote {
start_buffer
fcall string_squote;
}
# Machine for processing double quoted strings.
string_dquote := |*
dquote => {
emit_buffer(@ts, :T_STRING)
fret;
};
any;
*|;
# Machine for processing single quoted strings.
string_squote := |*
squote => {
emit_buffer(@ts, :T_STRING)
fret;
};
any;
*|;
# DOCTYPES
#
# http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
#
# These rules support the 3 flavours of doctypes:
#
# 1. Normal doctypes, as introduced in the HTML5 specification.
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
# 3. Legacy doctypes
#
doctype_start = '<!DOCTYPE'i whitespace+ 'HTML'i;
action start_doctype {
emit_buffer
add_token(:T_DOCTYPE_START)
fcall doctype;
}
# Machine for processing doctypes. Doctype values such as the public and
# system IDs are treated as T_STRING tokens.
doctype := |*
'PUBLIC' | 'SYSTEM' => { t(:T_DOCTYPE_TYPE) };
# Lex the public/system IDs as regular strings.
dquote => start_string_dquote;
squote => start_string_squote;
# Whitespace inside doctypes is ignored since there's no point in
# including it.
whitespace;
'>' => {
add_token(:T_DOCTYPE_END)
fret;
};
*|;
# CDATA
#
# http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
#
# CDATA tags are broken up into 3 parts: the start, the content and the
# end tag.
#
# In HTML CDATA tags have no meaning/are not supported. Oga does
# support them but treats their contents as plain text.
#
cdata_start = '<![CDATA[';
cdata_end = ']]>';
action start_cdata {
emit_buffer
add_token(:T_CDATA_START)
start_buffer
fcall cdata;
}
# Machine that for processing the contents of CDATA tags. Everything
# inside a CDATA tag is treated as plain text.
cdata := |*
cdata_end => {
emit_buffer
add_token(:T_CDATA_END)
fret;
};
any;
*|;
# Comments
#
# http://www.w3.org/TR/html-markup/syntax.html#comments
#
# Comments are lexed into 3 parts: the start tag, the content and the end
# tag.
#
# Unlike the W3 specification these rules *do* allow character sequences
# such as `--` and `->`. Putting extra checks in for these sequences
# would actually make the rules/actions more complex.
#
comment_start = '<!--';
comment_end = '-->';
action start_comment {
emit_buffer
add_token(:T_COMMENT_START)
start_buffer
fcall comment;
}
# Machine used for processing the contents of a comment. Everything
# inside a comment is treated as plain text (similar to CDATA tags).
comment := |*
comment_end => {
emit_buffer
add_token(:T_COMMENT_END)
fret;
};
any;
*|;
# XML declaration tags
#
# http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
#
xml_decl_start = '<?xml';
xml_decl_end = '?>';
action start_xml_decl {
emit_buffer
add_token(:T_XML_DECL_START)
start_buffer
fcall xml_decl;
}
# Machine that processes the contents of an XML declaration tag.
xml_decl := |*
xml_decl_end => {
emit_buffer
add_token(:T_XML_DECL_END)
fret;
};
any;
*|;
# Elements
#
# http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
#
# Action that creates the tokens for the opening tag, name and namespace
# (if any). Remaining work is delegated to a dedicated machine.
action start_element {
emit_buffer
add_token(:T_ELEM_START)
# Add the element name. If the name includes a namespace we'll break
# the name up into two separate tokens.
name = text(@ts + 1)
if name.include?(':')
ns, name = name.split(':')
add_token(:T_ELEM_NS, ns)
end
@elements << name
add_token(:T_ELEM_NAME, name)
fcall element_head;
}
element_name = [a-zA-Z0-9\-_:]+;
element_start = '<' element_name;
# Machine used for processing the characters inside a element head. An
# element head is everything between `<NAME` (where NAME is the element
# name) and `>`.
#
# For example, in `<p foo="bar">` the element head is ` foo="bar"`.
#
element_head := |*
whitespace | '=';
newline => { advance_line };
# Attribute names.
element_name => { t(:T_ATTR) };
# Attribute values.
dquote => start_string_dquote;
squote => start_string_squote;
# The closing character of the open tag.
('>' | '/') => {
fhold;
fret;
};
*|;
main := |*
element_start => start_element;
doctype_start => start_doctype;
cdata_start => start_cdata;
comment_start => start_comment;
xml_decl_start => start_xml_decl;
# Enter the body of the tag. If HTML mode is enabled and the current
# element is a void element we'll close it and bail out.
'>' => {
if html? and HTML_VOID_ELEMENTS.include?(current_element)
add_token(:T_ELEM_END, nil)
@elements.pop
end
};
# Regular closing tags.
'</' element_name '>' => {
emit_buffer
add_token(:T_ELEM_END, nil)
@elements.pop
};
# Self closing elements that are not handled by the HTML mode.
'/>' => {
add_token(:T_ELEM_END, nil)
@elements.pop
};
# Note that this rule should be declared at the very bottom as it will
# otherwise take precedence over the other rules.
any => {
# First character, start buffering (unless we already are buffering).
start_buffer(@ts) unless buffering?
# EOF, emit the text buffer.
if @te == eof
emit_buffer(@te)
end
};
*|;
}%%
end # Lexer
end # Oga

1108
lib/oga/xml/lexer.rb Normal file

File diff suppressed because it is too large Load Diff

510
lib/oga/xml/lexer.rl Normal file
View File

@ -0,0 +1,510 @@
%%machine lexer; # %
module Oga
module XML
##
# Low level lexer that supports both XML and HTML (using an extra option). To
# lex HTML input set the `:html` option to `true` when creating an instance
# of the lexer:
#
# lexer = Oga::XML::Lexer.new(:html => true)
#
# @!attribute [r] html
# @return [TrueClass|FalseClass]
#
class Lexer
%% write data; # %
attr_reader :html
##
# Names of the HTML void elements that should be handled when HTML lexing
# is enabled.
#
# @return [Array]
#
HTML_VOID_ELEMENTS = [
'area',
'base',
'br',
'col',
'command',
'embed',
'hr',
'img',
'input',
'keygen',
'link',
'meta',
'param',
'source',
'track',
'wbr'
]
# Lazy way of forwarding instance method calls used internally by Ragel to
# their corresponding class methods.
private_methods.grep(/^_lexer_/).each do |name|
define_method(name) do
return self.class.send(name)
end
private(name)
end
##
# @param [Hash] options
#
# @option options [Symbol] :html When set to `true` the lexer will treat
# the input as HTML instead of SGML/XML. This makes it possible to lex
# HTML void elements such as `<link href="">`.
#
def initialize(options = {})
options.each do |key, value|
instance_variable_set("@#{key}", value) if respond_to?(key)
end
reset
end
##
# Resets the internal state of the lexer. Typically you don't need to call
# this method yourself as it's called by #lex after lexing a given String.
#
def reset
@line = 1
@data = nil
@ts = nil
@te = nil
@tokens = []
@stack = []
@top = 0
@elements = []
@buffer_start_position = nil
end
##
# Lexes the supplied String and returns an Array of tokens. Each token is
# an Array in the following format:
#
# [TYPE, VALUE]
#
# The type is a symbol, the value is either nil or a String.
#
# @param [String] data The string to lex.
# @return [Array]
#
def lex(data)
@data = data.unpack('U*')
lexer_start = self.class.lexer_start
eof = data.length
%% write init;
%% write exec;
tokens = @tokens
reset
return tokens
end
##
# @return [TrueClass|FalseClass]
#
def html?
return !!html
end
private
##
# @param [Fixnum] amount The amount of lines to advance.
#
def advance_line(amount = 1)
@line += amount
end
##
# Emits a token whose value is based on the supplied start/stop position.
#
# @param [Symbol] type The token type.
# @param [Fixnum] start
# @param [Fixnum] stop
#
# @see #text
# @see #add_token
#
def t(type, start = @ts, stop = @te)
value = text(start, stop)
add_token(type, value)
end
##
# Returns the text of the current buffer based on the supplied start and
# stop position.
#
# By default `@ts` and `@te` are used as the start/stop position.
#
# @param [Fixnum] start
# @param [Fixnum] stop
# @return [String]
#
def text(start = @ts, stop = @te)
return @data[start...stop].pack('U*')
end
##
# Adds a token with the given type and value to the list.
#
# @param [Symbol] type The token type.
# @param [String] value The token value.
#
def add_token(type, value = nil)
token = [type, value, @line]
@tokens << token
end
##
# Enables buffering starting at the given position.
#
# @param [Fixnum] position The start position of the buffer, set to `@te`
# by default.
#
def start_buffer(position = @te)
@buffer_start_position = position
end
##
# Returns `true` if we're currently buffering.
#
# @return [TrueClass|FalseClass]
#
def buffering?
return !!@buffer_start_position
end
##
# Emits the current buffer if we have any. The current line number is
# advanced based on the amount of newlines in the buffer.
#
# @param [Fixnum] position The end position of the buffer, set to `@ts` by
# default.
#
# @param [Symbol] type The type of node to emit.
#
def emit_buffer(position = @ts, type = :T_TEXT)
return unless @buffer_start_position
content = text(@buffer_start_position, position)
unless content.empty?
add_token(type, content)
lines = content.count("\n")
advance_line(lines) if lines > 0
end
@buffer_start_position = nil
end
##
# Returns the name of the element we're currently in.
#
# @return [String]
#
def current_element
return @elements.last
end
%%{
# Use instance variables for `ts` and friends.
access @;
getkey (@data[p] || 0);
newline = '\n' | '\r\n';
whitespace = [ \t];
# Strings
#
# Strings in HTML can either be single or double quoted. If a string
# starts with one of these quotes it must be closed with the same type of
# quote.
dquote = '"';
squote = "'";
action start_string_dquote {
start_buffer
fcall string_dquote;
}
action start_string_squote {
start_buffer
fcall string_squote;
}
# Machine for processing double quoted strings.
string_dquote := |*
dquote => {
emit_buffer(@ts, :T_STRING)
fret;
};
any;
*|;
# Machine for processing single quoted strings.
string_squote := |*
squote => {
emit_buffer(@ts, :T_STRING)
fret;
};
any;
*|;
# DOCTYPES
#
# http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
#
# These rules support the 3 flavours of doctypes:
#
# 1. Normal doctypes, as introduced in the HTML5 specification.
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
# 3. Legacy doctypes
#
doctype_start = '<!DOCTYPE'i whitespace+ 'HTML'i;
action start_doctype {
emit_buffer
add_token(:T_DOCTYPE_START)
fcall doctype;
}
# Machine for processing doctypes. Doctype values such as the public and
# system IDs are treated as T_STRING tokens.
doctype := |*
'PUBLIC' | 'SYSTEM' => { t(:T_DOCTYPE_TYPE) };
# Lex the public/system IDs as regular strings.
dquote => start_string_dquote;
squote => start_string_squote;
# Whitespace inside doctypes is ignored since there's no point in
# including it.
whitespace;
'>' => {
add_token(:T_DOCTYPE_END)
fret;
};
*|;
# CDATA
#
# http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
#
# CDATA tags are broken up into 3 parts: the start, the content and the
# end tag.
#
# In HTML CDATA tags have no meaning/are not supported. Oga does
# support them but treats their contents as plain text.
#
cdata_start = '<![CDATA[';
cdata_end = ']]>';
action start_cdata {
emit_buffer
add_token(:T_CDATA_START)
start_buffer
fcall cdata;
}
# Machine for processing the contents of CDATA tags. Everything
# inside a CDATA tag is treated as plain text.
cdata := |*
cdata_end => {
emit_buffer
add_token(:T_CDATA_END)
fret;
};
any;
*|;
# Comments
#
# http://www.w3.org/TR/html-markup/syntax.html#comments
#
# Comments are lexed into 3 parts: the start tag, the content and the end
# tag.
#
# Unlike the W3 specification these rules *do* allow character sequences
# such as `--` and `->`. Putting extra checks in for these sequences
# would actually make the rules/actions more complex.
#
comment_start = '<!--';
comment_end = '-->';
action start_comment {
emit_buffer
add_token(:T_COMMENT_START)
start_buffer
fcall comment;
}
# Machine used for processing the contents of a comment. Everything
# inside a comment is treated as plain text (similar to CDATA tags).
comment := |*
comment_end => {
emit_buffer
add_token(:T_COMMENT_END)
fret;
};
any;
*|;
# XML declaration tags
#
# http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
#
xml_decl_start = '<?xml';
xml_decl_end = '?>';
action start_xml_decl {
emit_buffer
add_token(:T_XML_DECL_START)
start_buffer
fcall xml_decl;
}
# Machine that processes the contents of an XML declaration tag.
xml_decl := |*
xml_decl_end => {
emit_buffer
add_token(:T_XML_DECL_END)
fret;
};
any;
*|;
# Elements
#
# http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
#
# Action that creates the tokens for the opening tag, name and namespace
# (if any). Remaining work is delegated to a dedicated machine.
action start_element {
emit_buffer
add_token(:T_ELEM_START)
# Add the element name. If the name includes a namespace we'll break
# the name up into two separate tokens.
name = text(@ts + 1)
if name.include?(':')
ns, name = name.split(':')
add_token(:T_ELEM_NS, ns)
end
@elements << name
add_token(:T_ELEM_NAME, name)
fcall element_head;
}
element_name = [a-zA-Z0-9\-_:]+;
element_start = '<' element_name;
# Machine used for processing the characters inside a element head. An
# element head is everything between `<NAME` (where NAME is the element
# name) and `>`.
#
# For example, in `<p foo="bar">` the element head is ` foo="bar"`.
#
element_head := |*
whitespace | '=';
newline => { advance_line };
# Attribute names.
element_name => { t(:T_ATTR) };
# Attribute values.
dquote => start_string_dquote;
squote => start_string_squote;
# The closing character of the open tag.
('>' | '/') => {
fhold;
fret;
};
*|;
main := |*
element_start => start_element;
doctype_start => start_doctype;
cdata_start => start_cdata;
comment_start => start_comment;
xml_decl_start => start_xml_decl;
# Enter the body of the tag. If HTML mode is enabled and the current
# element is a void element we'll close it and bail out.
'>' => {
if html? and HTML_VOID_ELEMENTS.include?(current_element)
add_token(:T_ELEM_END, nil)
@elements.pop
end
};
# Regular closing tags.
'</' element_name '>' => {
emit_buffer
add_token(:T_ELEM_END, nil)
@elements.pop
};
# Self closing elements that are not handled by the HTML mode.
'/>' => {
add_token(:T_ELEM_END, nil)
@elements.pop
};
# Note that this rule should be declared at the very bottom as it will
# otherwise take precedence over the other rules.
any => {
# First character, start buffering (unless we already are buffering).
start_buffer(@ts) unless buffering?
# EOF, emit the text buffer.
if @te == eof
emit_buffer(@te)
end
};
*|;
}%%
end # Lexer
end # XML
end # Oga

402
lib/oga/xml/parser.rb Normal file
View File

@ -0,0 +1,402 @@
#
# DO NOT MODIFY!!!!
# This file is automatically generated by Racc 1.4.11
# from Racc grammer file "".
#
require 'racc/parser.rb'
module Oga
module XML
class Parser < Racc::Parser
##
# @param [Hash] options
#
# @option options [TrueClass|FalseClass] :html Enables HTML parsing mode.
# @see Oga::XML::Lexer#initialize
#
def initialize(options = {})
@lexer = Lexer.new(options)
end
##
# Resets the internal state of the parser.
#
def reset
@lines = []
@line = 1
end
##
# Emits a new AST token.
#
# @param [Symbol] type
# @param [Array] children
#
def s(type, *children)
return AST::Node.new(
type,
children.flatten,
:line => @line
)
end
##
# Returns the next token from the lexer.
#
# @return [Array]
#
def next_token
type, value, line = @tokens.shift
@line = line if line
return type ? [type, value] : [false, false]
end
##
# @param [Fixnum] type The type of token the error occured on.
# @param [String] value The value of the token.
# @param [Array] stack The current stack of parsed nodes.
# @raise [Racc::ParseError]
#
def on_error(type, value, stack)
name = token_to_str(type)
index = @line - 1
lines = ''
# Show up to 5 lines before and after the offending line (if they exist).
(-5..5).each do |offset|
line = @lines[index + offset]
number = @line + offset
if line and number > 0
if offset == 0
prefix = '=> '
else
prefix = ' '
end
lines << "#{prefix}#{number}: #{line.strip}\n"
end
end
raise Racc::ParseError, <<-EOF
Unexpected #{name} with value #{value.inspect} on line #{@line}:
#{lines}
EOF
end
##
# Parses the supplied string and returns the AST.
#
# @example
# parser = Oga::XML::Parser.new
# ast = parser.parse('<foo>bar</foo>')
#
# @param [String] string
# @return [Oga::AST::Node]
#
def parse(string)
@lines = string.lines
@tokens = @lexer.lex(string)
ast = do_parse
reset
return ast
end
# vim: set ft=racc:
##### State transition tables begin ###
racc_action_table = [
16, 40, 16, 10, 24, 37, 11, 22, 12, 28,
14, 23, 21, 45, 31, 15, 16, 10, 44, 28,
11, 43, 12, 36, 14, 35, 16, 10, 34, 15,
11, 41, 12, 42, 14, 33, 16, 10, 17, 15,
11, 46, 12, nil, 14, 29, 30, 19, 20, 15 ]
racc_action_check = [
15, 28, 38, 38, 12, 24, 38, 11, 38, 13,
38, 12, 11, 38, 15, 38, 2, 2, 35, 26,
2, 35, 2, 22, 2, 20, 25, 25, 20, 2,
25, 30, 25, 32, 25, 17, 0, 0, 1, 25,
0, 44, 0, nil, 0, 14, 14, 10, 10, 0 ]
racc_action_pointer = [
33, 38, 13, nil, nil, nil, nil, nil, nil, nil,
42, 4, 1, -6, 33, -3, nil, 35, nil, nil,
23, nil, 15, nil, -5, 23, 4, nil, -1, nil,
19, nil, 16, nil, nil, 16, nil, nil, -1, nil,
nil, nil, nil, nil, 36, nil, nil ]
racc_action_default = [
-2, -32, -1, -4, -6, -7, -8, -9, -10, -11,
-32, -32, -32, -24, -32, -32, -31, -32, -3, -12,
-32, -16, -32, -18, -32, -5, -23, -26, -27, -21,
-32, -29, -32, 47, -13, -32, -17, -19, -32, -25,
-28, -22, -30, -14, -32, -20, -15 ]
racc_goto_table = [
18, 2, 27, 32, 25, 26, 1, nil, nil, nil,
nil, nil, nil, nil, nil, 39, nil, nil, nil, nil,
nil, nil, nil, nil, nil, nil, 38, nil, nil, nil,
nil, nil, nil, nil, nil, nil, 18 ]
racc_goto_check = [
3, 2, 13, 8, 11, 12, 1, nil, nil, nil,
nil, nil, nil, nil, nil, 13, nil, nil, nil, nil,
nil, nil, nil, nil, nil, nil, 2, nil, nil, nil,
nil, nil, nil, nil, nil, nil, 3 ]
racc_goto_pointer = [
nil, 6, 1, -2, nil, nil, nil, nil, -12, nil,
nil, -9, -8, -11 ]
racc_goto_default = [
nil, nil, nil, 3, 4, 5, 6, 7, 8, 9,
13, nil, nil, nil ]
racc_reduce_table = [
0, 0, :racc_error,
1, 19, :_reduce_1,
0, 19, :_reduce_2,
2, 20, :_reduce_3,
1, 20, :_reduce_4,
0, 20, :_reduce_5,
1, 21, :_reduce_none,
1, 21, :_reduce_none,
1, 21, :_reduce_none,
1, 21, :_reduce_none,
1, 21, :_reduce_none,
1, 21, :_reduce_none,
2, 22, :_reduce_12,
3, 22, :_reduce_13,
4, 22, :_reduce_14,
5, 22, :_reduce_15,
2, 23, :_reduce_16,
3, 23, :_reduce_17,
2, 24, :_reduce_18,
3, 24, :_reduce_19,
4, 25, :_reduce_20,
2, 28, :_reduce_21,
3, 28, :_reduce_22,
1, 29, :_reduce_23,
0, 29, :_reduce_24,
2, 30, :_reduce_25,
1, 30, :_reduce_26,
1, 31, :_reduce_27,
2, 31, :_reduce_28,
2, 27, :_reduce_29,
3, 27, :_reduce_30,
1, 26, :_reduce_31 ]
racc_reduce_n = 32
racc_shift_n = 47
racc_token_table = {
false => 0,
:error => 1,
:T_STRING => 2,
:T_TEXT => 3,
:T_DOCTYPE_START => 4,
:T_DOCTYPE_END => 5,
:T_DOCTYPE_TYPE => 6,
:T_CDATA_START => 7,
:T_CDATA_END => 8,
:T_COMMENT_START => 9,
:T_COMMENT_END => 10,
:T_ELEM_START => 11,
:T_ELEM_NAME => 12,
:T_ELEM_NS => 13,
:T_ELEM_END => 14,
:T_ATTR => 15,
:T_XML_DECL_START => 16,
:T_XML_DECL_END => 17 }
racc_nt_base = 18
racc_use_result_var = false
Racc_arg = [
racc_action_table,
racc_action_check,
racc_action_default,
racc_action_pointer,
racc_goto_table,
racc_goto_check,
racc_goto_default,
racc_goto_pointer,
racc_nt_base,
racc_reduce_table,
racc_token_table,
racc_shift_n,
racc_reduce_n,
racc_use_result_var ]
Racc_token_to_s_table = [
"$end",
"error",
"T_STRING",
"T_TEXT",
"T_DOCTYPE_START",
"T_DOCTYPE_END",
"T_DOCTYPE_TYPE",
"T_CDATA_START",
"T_CDATA_END",
"T_COMMENT_START",
"T_COMMENT_END",
"T_ELEM_START",
"T_ELEM_NAME",
"T_ELEM_NS",
"T_ELEM_END",
"T_ATTR",
"T_XML_DECL_START",
"T_XML_DECL_END",
"$start",
"document",
"expressions",
"expression",
"doctype",
"cdata",
"comment",
"element",
"text",
"xmldecl",
"element_open",
"attributes",
"attributes_",
"attribute" ]
Racc_debug_parser = false
##### State transition tables end #####
# reduce 0 omitted
def _reduce_1(val, _values)
s(:document, val[0])
end
def _reduce_2(val, _values)
s(:document)
end
def _reduce_3(val, _values)
val.compact
end
def _reduce_4(val, _values)
val[0]
end
def _reduce_5(val, _values)
nil
end
# reduce 6 omitted
# reduce 7 omitted
# reduce 8 omitted
# reduce 9 omitted
# reduce 10 omitted
# reduce 11 omitted
def _reduce_12(val, _values)
s(:doctype)
end
def _reduce_13(val, _values)
s(:doctype, val[1])
end
def _reduce_14(val, _values)
s(:doctype, val[1], val[2])
end
def _reduce_15(val, _values)
s(:doctype, val[1], val[2], val[3])
end
def _reduce_16(val, _values)
s(:cdata)
end
def _reduce_17(val, _values)
s(:cdata, val[1])
end
def _reduce_18(val, _values)
s(:comment)
end
def _reduce_19(val, _values)
s(:comment, val[1])
end
def _reduce_20(val, _values)
s(:element, val[0], val[1], val[2])
end
def _reduce_21(val, _values)
[nil, val[1]]
end
def _reduce_22(val, _values)
[val[1], val[2]]
end
def _reduce_23(val, _values)
s(:attributes, val[0])
end
def _reduce_24(val, _values)
nil
end
def _reduce_25(val, _values)
val
end
def _reduce_26(val, _values)
val
end
def _reduce_27(val, _values)
s(:attribute, val[0])
end
def _reduce_28(val, _values)
s(:attribute, val[0], val[1])
end
def _reduce_29(val, _values)
s(:xml_decl)
end
def _reduce_30(val, _values)
s(:xml_decl, val[1])
end
def _reduce_31(val, _values)
s(:text, val[0])
end
def _reduce_none(val, _values)
val[0]
end
end # class Parser
end # module XML
end # module Oga

View File

@ -5,9 +5,9 @@
# It requires every tag to have a closing tag. As such you'll need to enable
# HTML parsing mode when parsing HTML. This can be done as following:
#
# parser = Oga::Parser.new(:html => true)
# parser = Oga::XML::Parser.new(:html => true)
#
class Oga::Parser
class Oga::XML::Parser
token T_STRING T_TEXT
token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE

View File

@ -1,6 +1,6 @@
require 'spec_helper'
describe Oga::Lexer do
describe Oga::XML::Lexer do
context 'cdata tags' do
example 'lex a cdata tag' do
lex('<![CDATA[foo]]>').should == [

View File

@ -1,6 +1,6 @@
require 'spec_helper'
describe Oga::Lexer do
describe Oga::XML::Lexer do
context 'comments' do
example 'lex a comment' do
lex('<!-- foo -->').should == [

View File

@ -1,6 +1,6 @@
require 'spec_helper'
describe Oga::Lexer do
describe Oga::XML::Lexer do
context 'doctypes' do
example 'lex the HTML5 doctype' do
lex('<!DOCTYPE html>').should == [

View File

@ -1,6 +1,6 @@
require 'spec_helper'
describe Oga::Lexer do
describe Oga::XML::Lexer do
context 'HTML documents' do
example 'lex a basic HTML document' do
html = <<-EOF

View File

@ -1,6 +1,6 @@
require 'spec_helper'
describe Oga::Lexer do
describe Oga::XML::Lexer do
context 'elements' do
example 'lex an opening element' do
lex('<p>').should == [

View File

@ -1,6 +1,6 @@
require 'spec_helper'
describe Oga::Lexer do
describe Oga::XML::Lexer do
context 'regular text' do
example 'lex regular text' do
lex('hello').should == [[:T_TEXT, 'hello', 1]]

View File

@ -1,6 +1,6 @@
require 'spec_helper'
describe Oga::Lexer do
describe Oga::XML::Lexer do
context 'HTML void elements' do
example 'lex a void element that omits the closing /' do
lex('<link>', :html => true).should == [

View File

@ -1,6 +1,6 @@
require 'spec_helper'
describe Oga::Lexer do
describe Oga::XML::Lexer do
context 'XML declaration tags' do
example 'lex a start tag' do
lex('<?xml').should == [[:T_XML_DECL_START, nil, 1]]

View File

@ -1,6 +1,6 @@
require 'spec_helper'
describe Oga::Parser do
describe Oga::XML::Parser do
context 'cdata tags' do
example 'parse a cdata tag' do
parse('<![CDATA[foo]]>').should == s(:document, s(:cdata, 'foo'))

View File

@ -1,6 +1,6 @@
require 'spec_helper'
describe Oga::Parser do
describe Oga::XML::Parser do
context 'comments' do
example 'parse an empty comment' do
parse('<!---->').should == s(:document, s(:comment))

View File

@ -1,6 +1,6 @@
require 'spec_helper'
describe Oga::Parser do
describe Oga::XML::Parser do
context 'doctypes' do
example 'parse a doctype' do
parse('<!DOCTYPE html>').should == s(:document, s(:doctype))

View File

@ -1,6 +1,6 @@
require 'spec_helper'
describe Oga::Parser do
describe Oga::XML::Parser do
context 'HTML documents' do
example 'parse a basic HTML document' do
html = <<-EOF

View File

@ -1,6 +1,6 @@
require 'spec_helper'
describe Oga::Parser do
describe Oga::XML::Parser do
context 'elements' do
example 'parse an empty element' do
parse('<p></p>').should == s(

View File

@ -1,6 +1,6 @@
require 'spec_helper'
describe Oga::Parser do
describe Oga::XML::Parser do
example 'parse regular text' do
parse('foo').should == s(:document, s(:text, 'foo'))
end

View File

@ -1,6 +1,6 @@
require 'spec_helper'
describe Oga::Parser do
describe Oga::XML::Parser do
context 'HTML void elements' do
example 'parse a void element that omits the closing /' do
parse('<link>', :html => true).should == s(

View File

@ -1,6 +1,6 @@
require 'spec_helper'
describe Oga::Parser do
describe Oga::XML::Parser do
context 'XML declaration tags' do
example 'lex an XML declaration tag' do
parse('<?xml hello ?>').should == s(

View File

@ -19,7 +19,7 @@ module Oga
# @return [Array]
#
def lex(input, options = {})
return Oga::Lexer.new(options).lex(input)
return Oga::XML::Lexer.new(options).lex(input)
end
##
@ -30,7 +30,7 @@ module Oga
# @return [Oga::AST::Node]
#
def parse(input, options = {})
return Oga::Parser.new(options).parse(input)
return Oga::XML::Parser.new(options).parse(input)
end
end # ParsingHelpers
end # Oga