Namespaced the lexer/parser under Oga::XML.
With the upcoming XPath and CSS selector lexers/parsers it will be confusing to keep these in the root namespace.
This commit is contained in:
parent
2259061c89
commit
eae13d21ed
6
Rakefile
6
Rakefile
|
@ -5,10 +5,10 @@ require 'cliver'
|
|||
|
||||
GEMSPEC = Gem::Specification.load('oga.gemspec')
|
||||
|
||||
LEXER_INPUT = 'lib/oga/lexer.rl'
|
||||
LEXER_OUTPUT = 'lib/oga/lexer.rb'
|
||||
LEXER_INPUT = 'lib/oga/xml/lexer.rl'
|
||||
LEXER_OUTPUT = 'lib/oga/xml/lexer.rb'
|
||||
|
||||
HTML_PARSER = 'lib/oga/parser.rb'
|
||||
HTML_PARSER = 'lib/oga/xml/parser.rb'
|
||||
|
||||
GENERATED_FILES = ['coverage', 'yardoc', LEXER_OUTPUT, HTML_PARSER]
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ string = 'Hello, how are you doing today?'
|
|||
small = "<![CDATA[#{string}]]>"
|
||||
medium = "<![CDATA[#{string * 1_000}]]>"
|
||||
large = "<![CDATA[#{string * 10_000}]]>"
|
||||
lexer = Oga::Lexer.new
|
||||
lexer = Oga::XML::Lexer.new
|
||||
|
||||
Benchmark.ips do |bench|
|
||||
bench.report 'CDATA with a small body' do
|
||||
|
|
|
@ -4,7 +4,7 @@ require 'benchmark/ips'
|
|||
simple = '<p>Hello world</p>'
|
||||
attributes = '<p class="foo">Hello world</p>'
|
||||
nested = '<p>Hello<strong>world</strong></p>'
|
||||
lexer = Oga::Lexer.new
|
||||
lexer = Oga::XML::Lexer.new
|
||||
|
||||
Benchmark.ips do |bench|
|
||||
bench.report 'text only' do
|
||||
|
|
|
@ -2,7 +2,7 @@ require_relative '../../lib/oga'
|
|||
require 'benchmark/ips'
|
||||
|
||||
html = File.read(File.expand_path('../../fixtures/hrs.html', __FILE__))
|
||||
lexer = Oga::Lexer.new(:html => true)
|
||||
lexer = Oga::XML::Lexer.new(:html => true)
|
||||
|
||||
Benchmark.ips do |bench|
|
||||
bench.report 'lex HTML' do
|
||||
|
|
|
@ -2,7 +2,7 @@ require_relative '../../lib/oga'
|
|||
require 'benchmark'
|
||||
|
||||
html = File.read(File.expand_path('../../fixtures/hrs.html', __FILE__))
|
||||
lexer = Oga::Lexer.new(:html => true)
|
||||
lexer = Oga::XML::Lexer.new(:html => true)
|
||||
|
||||
Benchmark.bmbm(20) do |bench|
|
||||
bench.report 'lex HTML' do
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
require 'ast'
|
||||
|
||||
require_relative 'oga/ast/node'
|
||||
require_relative 'oga/lexer'
|
||||
require_relative 'oga/parser'
|
||||
require_relative 'oga/xml/lexer'
|
||||
require_relative 'oga/xml/parser'
|
||||
|
|
508
lib/oga/lexer.rl
508
lib/oga/lexer.rl
|
@ -1,508 +0,0 @@
|
|||
%%machine lexer; # %
|
||||
|
||||
module Oga
|
||||
##
|
||||
# Low level lexer that supports both XML and HTML (using an extra option). To
|
||||
# lex HTML input set the `:html` option to `true` when creating an instance
|
||||
# of the lexer:
|
||||
#
|
||||
# lexer = Oga::Lexer.new(:html => true)
|
||||
#
|
||||
# @!attribute [r] html
|
||||
# @return [TrueClass|FalseClass]
|
||||
#
|
||||
class Lexer
|
||||
%% write data; # %
|
||||
|
||||
attr_reader :html
|
||||
|
||||
##
|
||||
# Names of the HTML void elements that should be handled when HTML lexing
|
||||
# is enabled.
|
||||
#
|
||||
# @return [Array]
|
||||
#
|
||||
HTML_VOID_ELEMENTS = [
|
||||
'area',
|
||||
'base',
|
||||
'br',
|
||||
'col',
|
||||
'command',
|
||||
'embed',
|
||||
'hr',
|
||||
'img',
|
||||
'input',
|
||||
'keygen',
|
||||
'link',
|
||||
'meta',
|
||||
'param',
|
||||
'source',
|
||||
'track',
|
||||
'wbr'
|
||||
]
|
||||
|
||||
# Lazy way of forwarding instance method calls used internally by Ragel to
|
||||
# their corresponding class methods.
|
||||
private_methods.grep(/^_lexer_/).each do |name|
|
||||
define_method(name) do
|
||||
return self.class.send(name)
|
||||
end
|
||||
|
||||
private(name)
|
||||
end
|
||||
|
||||
##
|
||||
# @param [Hash] options
|
||||
#
|
||||
# @option options [Symbol] :html When set to `true` the lexer will treat
|
||||
# the input as HTML instead of SGML/XML. This makes it possible to lex
|
||||
# HTML void elements such as `<link href="">`.
|
||||
#
|
||||
def initialize(options = {})
|
||||
options.each do |key, value|
|
||||
instance_variable_set("@#{key}", value) if respond_to?(key)
|
||||
end
|
||||
|
||||
reset
|
||||
end
|
||||
|
||||
##
|
||||
# Resets the internal state of the lexer. Typically you don't need to call
|
||||
# this method yourself as it is called by #lex after lexing a given String.
|
||||
#
|
||||
def reset
|
||||
@line = 1
|
||||
@data = nil
|
||||
@ts = nil
|
||||
@te = nil
|
||||
@tokens = []
|
||||
@stack = []
|
||||
@top = 0
|
||||
@elements = []
|
||||
|
||||
@buffer_start_position = nil
|
||||
end
|
||||
|
||||
##
|
||||
# Lexes the supplied String and returns an Array of tokens. Each token is
|
||||
# an Array in the following format:
|
||||
#
|
||||
# [TYPE, VALUE]
|
||||
#
|
||||
# The type is a symbol, the value is either nil or a String.
|
||||
#
|
||||
# @param [String] data The string to lex.
|
||||
# @return [Array]
|
||||
#
|
||||
def lex(data)
|
||||
@data = data.unpack('U*')
|
||||
lexer_start = self.class.lexer_start
|
||||
eof = data.length
|
||||
|
||||
%% write init;
|
||||
%% write exec;
|
||||
|
||||
tokens = @tokens
|
||||
|
||||
reset
|
||||
|
||||
return tokens
|
||||
end
|
||||
|
||||
##
|
||||
# @return [TrueClass|FalseClass]
|
||||
#
|
||||
def html?
|
||||
return !!html
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
##
|
||||
# @param [Fixnum] amount The amount of lines to advance.
|
||||
#
|
||||
def advance_line(amount = 1)
|
||||
@line += amount
|
||||
end
|
||||
|
||||
##
|
||||
# Emits a token whose value is based on the supplied start/stop position.
|
||||
#
|
||||
# @param [Symbol] type The token type.
|
||||
# @param [Fixnum] start
|
||||
# @param [Fixnum] stop
|
||||
#
|
||||
# @see #text
|
||||
# @see #add_token
|
||||
#
|
||||
def t(type, start = @ts, stop = @te)
|
||||
value = text(start, stop)
|
||||
|
||||
add_token(type, value)
|
||||
end
|
||||
|
||||
##
|
||||
# Returns the text of the current buffer based on the supplied start and
|
||||
# stop position.
|
||||
#
|
||||
# By default `@ts` and `@te` are used as the start/stop position.
|
||||
#
|
||||
# @param [Fixnum] start
|
||||
# @param [Fixnum] stop
|
||||
# @return [String]
|
||||
#
|
||||
def text(start = @ts, stop = @te)
|
||||
return @data[start...stop].pack('U*')
|
||||
end
|
||||
|
||||
##
|
||||
# Adds a token with the given type and value to the list.
|
||||
#
|
||||
# @param [Symbol] type The token type.
|
||||
# @param [String] value The token value.
|
||||
#
|
||||
def add_token(type, value = nil)
|
||||
token = [type, value, @line]
|
||||
|
||||
@tokens << token
|
||||
end
|
||||
|
||||
##
|
||||
# Enables buffering starting at the given position.
|
||||
#
|
||||
# @param [Fixnum] position The start position of the buffer, set to `@te`
|
||||
# by default.
|
||||
#
|
||||
def start_buffer(position = @te)
|
||||
@buffer_start_position = position
|
||||
end
|
||||
|
||||
##
|
||||
# Returns `true` if we're currently buffering.
|
||||
#
|
||||
# @return [TrueClass|FalseClass]
|
||||
#
|
||||
def buffering?
|
||||
return !!@buffer_start_position
|
||||
end
|
||||
|
||||
##
|
||||
# Emits the current buffer if we have any. The current line number is
|
||||
# advanced based on the amount of newlines in the buffer.
|
||||
#
|
||||
# @param [Fixnum] position The end position of the buffer, set to `@ts` by
|
||||
# default.
|
||||
#
|
||||
# @param [Symbol] type The type of node to emit.
|
||||
#
|
||||
def emit_buffer(position = @ts, type = :T_TEXT)
|
||||
return unless @buffer_start_position
|
||||
|
||||
content = text(@buffer_start_position, position)
|
||||
|
||||
unless content.empty?
|
||||
add_token(type, content)
|
||||
|
||||
lines = content.count("\n")
|
||||
|
||||
advance_line(lines) if lines > 0
|
||||
end
|
||||
|
||||
@buffer_start_position = nil
|
||||
end
|
||||
|
||||
##
|
||||
# Returns the name of the element we're currently in.
|
||||
#
|
||||
# @return [String]
|
||||
#
|
||||
def current_element
|
||||
return @elements.last
|
||||
end
|
||||
|
||||
%%{
|
||||
# Use instance variables for `ts` and friends.
|
||||
access @;
|
||||
getkey (@data[p] || 0);
|
||||
|
||||
newline = '\n' | '\r\n';
|
||||
whitespace = [ \t];
|
||||
|
||||
# Strings
|
||||
#
|
||||
# Strings in HTML can either be single or double quoted. If a string
|
||||
# starts with one of these quotes it must be closed with the same type of
|
||||
# quote.
|
||||
dquote = '"';
|
||||
squote = "'";
|
||||
|
||||
action start_string_dquote {
|
||||
start_buffer
|
||||
|
||||
fcall string_dquote;
|
||||
}
|
||||
|
||||
action start_string_squote {
|
||||
start_buffer
|
||||
|
||||
fcall string_squote;
|
||||
}
|
||||
|
||||
# Machine for processing double quoted strings.
|
||||
string_dquote := |*
|
||||
dquote => {
|
||||
emit_buffer(@ts, :T_STRING)
|
||||
fret;
|
||||
};
|
||||
|
||||
any;
|
||||
*|;
|
||||
|
||||
# Machine for processing single quoted strings.
|
||||
string_squote := |*
|
||||
squote => {
|
||||
emit_buffer(@ts, :T_STRING)
|
||||
fret;
|
||||
};
|
||||
|
||||
any;
|
||||
*|;
|
||||
|
||||
# DOCTYPES
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
|
||||
#
|
||||
# These rules support the 3 flavours of doctypes:
|
||||
#
|
||||
# 1. Normal doctypes, as introduced in the HTML5 specification.
|
||||
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
|
||||
# 3. Legacy doctypes
|
||||
#
|
||||
doctype_start = '<!DOCTYPE'i whitespace+ 'HTML'i;
|
||||
|
||||
action start_doctype {
|
||||
emit_buffer
|
||||
add_token(:T_DOCTYPE_START)
|
||||
fcall doctype;
|
||||
}
|
||||
|
||||
# Machine for processing doctypes. Doctype values such as the public and
|
||||
# system IDs are treated as T_STRING tokens.
|
||||
doctype := |*
|
||||
'PUBLIC' | 'SYSTEM' => { t(:T_DOCTYPE_TYPE) };
|
||||
|
||||
# Lex the public/system IDs as regular strings.
|
||||
dquote => start_string_dquote;
|
||||
squote => start_string_squote;
|
||||
|
||||
# Whitespace inside doctypes is ignored since there's no point in
|
||||
# including it.
|
||||
whitespace;
|
||||
|
||||
'>' => {
|
||||
add_token(:T_DOCTYPE_END)
|
||||
fret;
|
||||
};
|
||||
*|;
|
||||
|
||||
# CDATA
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
|
||||
#
|
||||
# CDATA tags are broken up into 3 parts: the start, the content and the
|
||||
# end tag.
|
||||
#
|
||||
# In HTML CDATA tags have no meaning/are not supported. Oga does
|
||||
# support them but treats their contents as plain text.
|
||||
#
|
||||
cdata_start = '<![CDATA[';
|
||||
cdata_end = ']]>';
|
||||
|
||||
action start_cdata {
|
||||
emit_buffer
|
||||
add_token(:T_CDATA_START)
|
||||
|
||||
start_buffer
|
||||
|
||||
fcall cdata;
|
||||
}
|
||||
|
||||
# Machine for processing the contents of CDATA tags. Everything
|
||||
# inside a CDATA tag is treated as plain text.
|
||||
cdata := |*
|
||||
cdata_end => {
|
||||
emit_buffer
|
||||
add_token(:T_CDATA_END)
|
||||
|
||||
fret;
|
||||
};
|
||||
|
||||
any;
|
||||
*|;
|
||||
|
||||
# Comments
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#comments
|
||||
#
|
||||
# Comments are lexed into 3 parts: the start tag, the content and the end
|
||||
# tag.
|
||||
#
|
||||
# Unlike the W3 specification these rules *do* allow character sequences
|
||||
# such as `--` and `->`. Putting extra checks in for these sequences
|
||||
# would actually make the rules/actions more complex.
|
||||
#
|
||||
comment_start = '<!--';
|
||||
comment_end = '-->';
|
||||
|
||||
action start_comment {
|
||||
emit_buffer
|
||||
add_token(:T_COMMENT_START)
|
||||
|
||||
start_buffer
|
||||
|
||||
fcall comment;
|
||||
}
|
||||
|
||||
# Machine used for processing the contents of a comment. Everything
|
||||
# inside a comment is treated as plain text (similar to CDATA tags).
|
||||
comment := |*
|
||||
comment_end => {
|
||||
emit_buffer
|
||||
add_token(:T_COMMENT_END)
|
||||
|
||||
fret;
|
||||
};
|
||||
|
||||
any;
|
||||
*|;
|
||||
|
||||
# XML declaration tags
|
||||
#
|
||||
# http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
|
||||
#
|
||||
xml_decl_start = '<?xml';
|
||||
xml_decl_end = '?>';
|
||||
|
||||
action start_xml_decl {
|
||||
emit_buffer
|
||||
add_token(:T_XML_DECL_START)
|
||||
|
||||
start_buffer
|
||||
|
||||
fcall xml_decl;
|
||||
}
|
||||
|
||||
# Machine that processes the contents of an XML declaration tag.
|
||||
xml_decl := |*
|
||||
xml_decl_end => {
|
||||
emit_buffer
|
||||
add_token(:T_XML_DECL_END)
|
||||
|
||||
fret;
|
||||
};
|
||||
|
||||
any;
|
||||
*|;
|
||||
|
||||
# Elements
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
|
||||
#
|
||||
|
||||
# Action that creates the tokens for the opening tag, name and namespace
|
||||
# (if any). Remaining work is delegated to a dedicated machine.
|
||||
action start_element {
|
||||
emit_buffer
|
||||
add_token(:T_ELEM_START)
|
||||
|
||||
# Add the element name. If the name includes a namespace we'll break
|
||||
# the name up into two separate tokens.
|
||||
name = text(@ts + 1)
|
||||
|
||||
if name.include?(':')
|
||||
ns, name = name.split(':')
|
||||
|
||||
add_token(:T_ELEM_NS, ns)
|
||||
end
|
||||
|
||||
@elements << name
|
||||
|
||||
add_token(:T_ELEM_NAME, name)
|
||||
|
||||
fcall element_head;
|
||||
}
|
||||
|
||||
element_name = [a-zA-Z0-9\-_:]+;
|
||||
element_start = '<' element_name;
|
||||
|
||||
# Machine used for processing the characters inside an element head. An
|
||||
# element head is everything between `<NAME` (where NAME is the element
|
||||
# name) and `>`.
|
||||
#
|
||||
# For example, in `<p foo="bar">` the element head is ` foo="bar"`.
|
||||
#
|
||||
element_head := |*
|
||||
whitespace | '=';
|
||||
|
||||
newline => { advance_line };
|
||||
|
||||
# Attribute names.
|
||||
element_name => { t(:T_ATTR) };
|
||||
|
||||
# Attribute values.
|
||||
dquote => start_string_dquote;
|
||||
squote => start_string_squote;
|
||||
|
||||
# The closing character of the open tag.
|
||||
('>' | '/') => {
|
||||
fhold;
|
||||
fret;
|
||||
};
|
||||
*|;
|
||||
|
||||
main := |*
|
||||
element_start => start_element;
|
||||
doctype_start => start_doctype;
|
||||
cdata_start => start_cdata;
|
||||
comment_start => start_comment;
|
||||
xml_decl_start => start_xml_decl;
|
||||
|
||||
# Enter the body of the tag. If HTML mode is enabled and the current
|
||||
# element is a void element we'll close it and bail out.
|
||||
'>' => {
|
||||
if html? and HTML_VOID_ELEMENTS.include?(current_element)
|
||||
add_token(:T_ELEM_END, nil)
|
||||
@elements.pop
|
||||
end
|
||||
};
|
||||
|
||||
# Regular closing tags.
|
||||
'</' element_name '>' => {
|
||||
emit_buffer
|
||||
add_token(:T_ELEM_END, nil)
|
||||
|
||||
@elements.pop
|
||||
};
|
||||
|
||||
# Self closing elements that are not handled by the HTML mode.
|
||||
'/>' => {
|
||||
add_token(:T_ELEM_END, nil)
|
||||
|
||||
@elements.pop
|
||||
};
|
||||
|
||||
# Note that this rule should be declared at the very bottom as it will
|
||||
# otherwise take precedence over the other rules.
|
||||
any => {
|
||||
# First character, start buffering (unless we already are buffering).
|
||||
start_buffer(@ts) unless buffering?
|
||||
|
||||
# EOF, emit the text buffer.
|
||||
if @te == eof
|
||||
emit_buffer(@te)
|
||||
end
|
||||
};
|
||||
*|;
|
||||
}%%
|
||||
end # Lexer
|
||||
end # Oga
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,510 @@
|
|||
%%machine lexer; # %
|
||||
|
||||
module Oga
|
||||
module XML
|
||||
##
|
||||
# Low level lexer that supports both XML and HTML (using an extra option). To
|
||||
# lex HTML input set the `:html` option to `true` when creating an instance
|
||||
# of the lexer:
|
||||
#
|
||||
# lexer = Oga::XML::Lexer.new(:html => true)
|
||||
#
|
||||
# @!attribute [r] html
|
||||
# @return [TrueClass|FalseClass]
|
||||
#
|
||||
class Lexer
|
||||
%% write data; # %
|
||||
|
||||
attr_reader :html
|
||||
|
||||
##
|
||||
# Names of the HTML void elements that should be handled when HTML lexing
|
||||
# is enabled.
|
||||
#
|
||||
# @return [Array]
|
||||
#
|
||||
HTML_VOID_ELEMENTS = [
|
||||
'area',
|
||||
'base',
|
||||
'br',
|
||||
'col',
|
||||
'command',
|
||||
'embed',
|
||||
'hr',
|
||||
'img',
|
||||
'input',
|
||||
'keygen',
|
||||
'link',
|
||||
'meta',
|
||||
'param',
|
||||
'source',
|
||||
'track',
|
||||
'wbr'
|
||||
]
|
||||
|
||||
# Lazy way of forwarding instance method calls used internally by Ragel to
|
||||
# their corresponding class methods.
|
||||
private_methods.grep(/^_lexer_/).each do |name|
|
||||
define_method(name) do
|
||||
return self.class.send(name)
|
||||
end
|
||||
|
||||
private(name)
|
||||
end
|
||||
|
||||
##
# Creates a new lexer and puts it in its initial state.
#
# Every option whose key is the name of a method the lexer responds to is
# stored in the instance variable of the same name; other keys are ignored.
#
# @param [Hash] options
#
# @option options [TrueClass|FalseClass] :html When set to `true` the lexer
#  will treat the input as HTML instead of SGML/XML. This makes it possible
#  to lex HTML void elements such as `<link href="">`.
#
def initialize(options = {})
  options.each do |key, value|
    instance_variable_set("@#{key}", value) if respond_to?(key)
  end

  reset
end
|
||||
|
||||
##
# Resets the internal state of the lexer. Typically you don't need to call
# this method yourself as it is called by #lex after lexing a given String.
#
def reset
  @line     = 1
  @data     = nil
  @ts       = nil
  @te       = nil
  @tokens   = []
  @stack    = []
  @top      = 0
  @elements = []

  @buffer_start_position = nil
end
|
||||
|
||||
##
|
||||
# Lexes the supplied String and returns an Array of tokens. Each token is
|
||||
# an Array in the following format:
|
||||
#
|
||||
# [TYPE, VALUE]
|
||||
#
|
||||
# The type is a symbol, the value is either nil or a String.
|
||||
#
|
||||
# @param [String] data The string to lex.
|
||||
# @return [Array]
|
||||
#
|
||||
def lex(data)
|
||||
@data = data.unpack('U*')
|
||||
lexer_start = self.class.lexer_start
|
||||
eof = data.length
|
||||
|
||||
%% write init;
|
||||
%% write exec;
|
||||
|
||||
tokens = @tokens
|
||||
|
||||
reset
|
||||
|
||||
return tokens
|
||||
end
|
||||
|
||||
##
# Returns `true` when the `html` option was set to a truthy value.
#
# @return [TrueClass|FalseClass]
#
def html?
  !!html
end
|
||||
|
||||
private
|
||||
|
||||
##
# Advances the current line number.
#
# @param [Fixnum] amount The amount of lines to advance.
# @return [Fixnum] The new line number.
#
def advance_line(amount = 1)
  @line += amount
end
|
||||
|
||||
##
# Emits a token whose value is based on the supplied start/stop position.
#
# By default `@ts` and `@te` are used as the start/stop position.
#
# @param [Symbol] type The token type.
# @param [Fixnum] start
# @param [Fixnum] stop
#
# @see #text
# @see #add_token
#
def t(type, start = @ts, stop = @te)
  add_token(type, text(start, stop))
end
|
||||
|
||||
##
# Returns the text of the current buffer based on the supplied start and
# stop position. The stop position itself is not included.
#
# By default `@ts` and `@te` are used as the start/stop position.
#
# @param [Fixnum] start
# @param [Fixnum] stop
# @return [String]
#
def text(start = @ts, stop = @te)
  # @data holds codepoints, so the slice is packed back into a String.
  @data[start...stop].pack('U*')
end
|
||||
|
||||
##
# Adds a token with the given type and value to the list. The current line
# number is stored as the third element of the token.
#
# @param [Symbol] type The token type.
# @param [String] value The token value.
#
def add_token(type, value = nil)
  @tokens << [type, value, @line]
end
|
||||
|
||||
##
# Enables buffering starting at the given position.
#
# @param [Fixnum] position The start position of the buffer, set to `@te`
#  by default.
#
def start_buffer(position = @te)
  @buffer_start_position = position
end
|
||||
|
||||
##
# Returns `true` if we're currently buffering, i.e. a buffer start position
# has been set. A start position of 0 counts as buffering.
#
# @return [TrueClass|FalseClass]
#
def buffering?
  !!@buffer_start_position
end
|
||||
|
||||
##
# Emits the current buffer if we have any and disables buffering. The
# current line number is advanced based on the amount of newlines in the
# buffer. Nothing is emitted when the buffered text is empty.
#
# @param [Fixnum] position The end position of the buffer, set to `@ts` by
#  default.
#
# @param [Symbol] type The type of node to emit.
#
def emit_buffer(position = @ts, type = :T_TEXT)
  return unless @buffer_start_position

  content = text(@buffer_start_position, position)

  unless content.empty?
    # The token is added before the newlines are counted, so it carries the
    # line number that was current when the buffer was emitted.
    add_token(type, content)

    lines = content.count("\n")

    advance_line(lines) if lines > 0
  end

  @buffer_start_position = nil
end
|
||||
|
||||
##
# Returns the name of the element we're currently in, or nil when no
# element is open.
#
# @return [String]
#
def current_element
  @elements.last
end
|
||||
|
||||
%%{
|
||||
# Use instance variables for `ts` and friends.
|
||||
access @;
|
||||
getkey (@data[p] || 0);
|
||||
|
||||
newline = '\n' | '\r\n';
|
||||
whitespace = [ \t];
|
||||
|
||||
# Strings
|
||||
#
|
||||
# Strings in HTML can either be single or double quoted. If a string
|
||||
# starts with one of these quotes it must be closed with the same type of
|
||||
# quote.
|
||||
dquote = '"';
|
||||
squote = "'";
|
||||
|
||||
action start_string_dquote {
|
||||
start_buffer
|
||||
|
||||
fcall string_dquote;
|
||||
}
|
||||
|
||||
action start_string_squote {
|
||||
start_buffer
|
||||
|
||||
fcall string_squote;
|
||||
}
|
||||
|
||||
# Machine for processing double quoted strings.
|
||||
string_dquote := |*
|
||||
dquote => {
|
||||
emit_buffer(@ts, :T_STRING)
|
||||
fret;
|
||||
};
|
||||
|
||||
any;
|
||||
*|;
|
||||
|
||||
# Machine for processing single quoted strings.
|
||||
string_squote := |*
|
||||
squote => {
|
||||
emit_buffer(@ts, :T_STRING)
|
||||
fret;
|
||||
};
|
||||
|
||||
any;
|
||||
*|;
|
||||
|
||||
# DOCTYPES
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
|
||||
#
|
||||
# These rules support the 3 flavours of doctypes:
|
||||
#
|
||||
# 1. Normal doctypes, as introduced in the HTML5 specification.
|
||||
# 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
|
||||
# 3. Legacy doctypes
|
||||
#
|
||||
doctype_start = '<!DOCTYPE'i whitespace+ 'HTML'i;
|
||||
|
||||
action start_doctype {
|
||||
emit_buffer
|
||||
add_token(:T_DOCTYPE_START)
|
||||
fcall doctype;
|
||||
}
|
||||
|
||||
# Machine for processing doctypes. Doctype values such as the public and
|
||||
# system IDs are treated as T_STRING tokens.
|
||||
doctype := |*
|
||||
'PUBLIC' | 'SYSTEM' => { t(:T_DOCTYPE_TYPE) };
|
||||
|
||||
# Lex the public/system IDs as regular strings.
|
||||
dquote => start_string_dquote;
|
||||
squote => start_string_squote;
|
||||
|
||||
# Whitespace inside doctypes is ignored since there's no point in
|
||||
# including it.
|
||||
whitespace;
|
||||
|
||||
'>' => {
|
||||
add_token(:T_DOCTYPE_END)
|
||||
fret;
|
||||
};
|
||||
*|;
|
||||
|
||||
# CDATA
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
|
||||
#
|
||||
# CDATA tags are broken up into 3 parts: the start, the content and the
|
||||
# end tag.
|
||||
#
|
||||
# In HTML CDATA tags have no meaning/are not supported. Oga does
|
||||
# support them but treats their contents as plain text.
|
||||
#
|
||||
cdata_start = '<![CDATA[';
|
||||
cdata_end = ']]>';
|
||||
|
||||
action start_cdata {
|
||||
emit_buffer
|
||||
add_token(:T_CDATA_START)
|
||||
|
||||
start_buffer
|
||||
|
||||
fcall cdata;
|
||||
}
|
||||
|
||||
# Machine for processing the contents of CDATA tags. Everything
|
||||
# inside a CDATA tag is treated as plain text.
|
||||
cdata := |*
|
||||
cdata_end => {
|
||||
emit_buffer
|
||||
add_token(:T_CDATA_END)
|
||||
|
||||
fret;
|
||||
};
|
||||
|
||||
any;
|
||||
*|;
|
||||
|
||||
# Comments
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#comments
|
||||
#
|
||||
# Comments are lexed into 3 parts: the start tag, the content and the end
|
||||
# tag.
|
||||
#
|
||||
# Unlike the W3 specification these rules *do* allow character sequences
|
||||
# such as `--` and `->`. Putting extra checks in for these sequences
|
||||
# would actually make the rules/actions more complex.
|
||||
#
|
||||
comment_start = '<!--';
|
||||
comment_end = '-->';
|
||||
|
||||
action start_comment {
|
||||
emit_buffer
|
||||
add_token(:T_COMMENT_START)
|
||||
|
||||
start_buffer
|
||||
|
||||
fcall comment;
|
||||
}
|
||||
|
||||
# Machine used for processing the contents of a comment. Everything
|
||||
# inside a comment is treated as plain text (similar to CDATA tags).
|
||||
comment := |*
|
||||
comment_end => {
|
||||
emit_buffer
|
||||
add_token(:T_COMMENT_END)
|
||||
|
||||
fret;
|
||||
};
|
||||
|
||||
any;
|
||||
*|;
|
||||
|
||||
# XML declaration tags
|
||||
#
|
||||
# http://www.w3.org/TR/REC-xml/#sec-prolog-dtd
|
||||
#
|
||||
xml_decl_start = '<?xml';
|
||||
xml_decl_end = '?>';
|
||||
|
||||
action start_xml_decl {
|
||||
emit_buffer
|
||||
add_token(:T_XML_DECL_START)
|
||||
|
||||
start_buffer
|
||||
|
||||
fcall xml_decl;
|
||||
}
|
||||
|
||||
# Machine that processes the contents of an XML declaration tag.
|
||||
xml_decl := |*
|
||||
xml_decl_end => {
|
||||
emit_buffer
|
||||
add_token(:T_XML_DECL_END)
|
||||
|
||||
fret;
|
||||
};
|
||||
|
||||
any;
|
||||
*|;
|
||||
|
||||
# Elements
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
|
||||
#
|
||||
|
||||
# Action that creates the tokens for the opening tag, name and namespace
# (if any). Remaining work is delegated to a dedicated machine.
action start_element {
  emit_buffer
  add_token(:T_ELEM_START)

  # Add the element name. If the name includes a namespace we'll break
  # the name up into two separate tokens.
  name = text(@ts + 1)

  if name.include?(':')
    # Only split on the first colon: `element_name` allows any number of
    # colons and an unlimited split would drop everything after the second
    # one (and leave the name nil for input such as `foo:`).
    ns, name = name.split(':', 2)

    add_token(:T_ELEM_NS, ns)
  end

  @elements << name

  add_token(:T_ELEM_NAME, name)

  fcall element_head;
}
|
||||
|
||||
element_name = [a-zA-Z0-9\-_:]+;
|
||||
element_start = '<' element_name;
|
||||
|
||||
# Machine used for processing the characters inside an element head. An
|
||||
# element head is everything between `<NAME` (where NAME is the element
|
||||
# name) and `>`.
|
||||
#
|
||||
# For example, in `<p foo="bar">` the element head is ` foo="bar"`.
|
||||
#
|
||||
element_head := |*
|
||||
whitespace | '=';
|
||||
|
||||
newline => { advance_line };
|
||||
|
||||
# Attribute names.
|
||||
element_name => { t(:T_ATTR) };
|
||||
|
||||
# Attribute values.
|
||||
dquote => start_string_dquote;
|
||||
squote => start_string_squote;
|
||||
|
||||
# The closing character of the open tag.
|
||||
('>' | '/') => {
|
||||
fhold;
|
||||
fret;
|
||||
};
|
||||
*|;
|
||||
|
||||
main := |*
|
||||
element_start => start_element;
|
||||
doctype_start => start_doctype;
|
||||
cdata_start => start_cdata;
|
||||
comment_start => start_comment;
|
||||
xml_decl_start => start_xml_decl;
|
||||
|
||||
# Enter the body of the tag. If HTML mode is enabled and the current
|
||||
# element is a void element we'll close it and bail out.
|
||||
'>' => {
|
||||
if html? and HTML_VOID_ELEMENTS.include?(current_element)
|
||||
add_token(:T_ELEM_END, nil)
|
||||
@elements.pop
|
||||
end
|
||||
};
|
||||
|
||||
# Regular closing tags.
|
||||
'</' element_name '>' => {
|
||||
emit_buffer
|
||||
add_token(:T_ELEM_END, nil)
|
||||
|
||||
@elements.pop
|
||||
};
|
||||
|
||||
# Self closing elements that are not handled by the HTML mode.
|
||||
'/>' => {
|
||||
add_token(:T_ELEM_END, nil)
|
||||
|
||||
@elements.pop
|
||||
};
|
||||
|
||||
# Note that this rule should be declared at the very bottom as it will
|
||||
# otherwise take precedence over the other rules.
|
||||
any => {
|
||||
# First character, start buffering (unless we already are buffering).
|
||||
start_buffer(@ts) unless buffering?
|
||||
|
||||
# EOF, emit the text buffer.
|
||||
if @te == eof
|
||||
emit_buffer(@te)
|
||||
end
|
||||
};
|
||||
*|;
|
||||
}%%
|
||||
end # Lexer
|
||||
end # XML
|
||||
end # Oga
|
|
@ -0,0 +1,402 @@
|
|||
#
|
||||
# DO NOT MODIFY!!!!
|
||||
# This file is automatically generated by Racc 1.4.11
|
||||
# from Racc grammer file "".
|
||||
#
|
||||
|
||||
require 'racc/parser.rb'
|
||||
module Oga
|
||||
module XML
|
||||
class Parser < Racc::Parser
|
||||
|
||||
##
# Creates the parser together with the lexer it reads its tokens from.
#
# @param [Hash] options
#
# @option options [TrueClass|FalseClass] :html Enables HTML parsing mode.
# @see Oga::XML::Lexer#initialize
#
def initialize(options = {})
  @lexer = Lexer.new(options)

  # Start from the same state every later parse starts from, instead of
  # leaving @line and @lines unset until the first parse has finished.
  reset
end
|
||||
|
||||
##
# Resets the internal state of the parser: the remembered input lines are
# dropped and the current line number is set back to 1.
#
def reset
  @lines = []
  @line  = 1
end
|
||||
|
||||
##
# Creates a new AST node. Nested Arrays of children are flattened and the
# current line number is passed along as the `:line` property.
#
# @param [Symbol] type
# @param [Array] children
# @return [AST::Node]
#
def s(type, *children)
  AST::Node.new(type, children.flatten, :line => @line)
end
|
||||
|
||||
##
# Returns the next token from the lexer and remembers the line it was found
# on. Once all tokens have been consumed `[false, false]` is returned.
#
# @return [Array]
#
def next_token
  type, value, line = @tokens.shift

  # Keep the last known line when the token list is exhausted.
  @line = line if line

  type ? [type, value] : [false, false]
end
|
||||
|
||||
##
# Raises a parse error that shows the offending line together with the
# lines surrounding it.
#
# @param [Fixnum] type The type of token the error occurred on.
# @param [String] value The value of the token.
# @param [Array] stack The current stack of parsed nodes.
# @raise [Racc::ParseError]
#
def on_error(type, value, stack)
  name  = token_to_str(type)
  index = @line - 1
  lines = ''

  # Show up to 5 lines before and after the offending line (if they exist).
  (-5..5).each do |offset|
    line   = @lines[index + offset]
    number = @line + offset

    # A negative index reads from the end of the Array, `number > 0` makes
    # sure those lines are not shown.
    next unless line && number > 0

    # Both prefixes have the same width so the line numbers line up.
    prefix = offset == 0 ? '=> ' : '   '

    lines << "#{prefix}#{number}: #{line.strip}\n"
  end

  raise Racc::ParseError, <<-EOF
Unexpected #{name} with value #{value.inspect} on line #{@line}:

#{lines}
  EOF
end
|
||||
|
||||
##
# Parses the supplied string and returns the AST. The parser state is reset
# afterwards, also when parsing raises.
#
# @example
#  parser = Oga::XML::Parser.new
#  ast    = parser.parse('<foo>bar</foo>')
#
# @param [String] string
# @return [Oga::AST::Node]
#
def parse(string)
  # The lines are kept around for the error message built by #on_error.
  @lines  = string.lines
  @tokens = @lexer.lex(string)

  return do_parse
ensure
  reset
end
|
||||
|
||||
# vim: set ft=racc:
|
||||
##### State transition tables begin ###
|
||||
|
||||
# NOTE: these tables are produced by Racc from the grammar file. Change the
# grammar and regenerate instead of editing the values by hand.

# Action tables.
racc_action_table = [
  16, 40, 16, 10, 24, 37, 11, 22, 12, 28,
  14, 23, 21, 45, 31, 15, 16, 10, 44, 28,
  11, 43, 12, 36, 14, 35, 16, 10, 34, 15,
  11, 41, 12, 42, 14, 33, 16, 10, 17, 15,
  11, 46, 12, nil, 14, 29, 30, 19, 20, 15
]

racc_action_check = [
  15, 28, 38, 38, 12, 24, 38, 11, 38, 13,
  38, 12, 11, 38, 15, 38, 2, 2, 35, 26,
  2, 35, 2, 22, 2, 20, 25, 25, 20, 2,
  25, 30, 25, 32, 25, 17, 0, 0, 1, 25,
  0, 44, 0, nil, 0, 14, 14, 10, 10, 0
]

racc_action_pointer = [
  33, 38, 13, nil, nil, nil, nil, nil, nil, nil,
  42, 4, 1, -6, 33, -3, nil, 35, nil, nil,
  23, nil, 15, nil, -5, 23, 4, nil, -1, nil,
  19, nil, 16, nil, nil, 16, nil, nil, -1, nil,
  nil, nil, nil, nil, 36, nil, nil
]

racc_action_default = [
  -2, -32, -1, -4, -6, -7, -8, -9, -10, -11,
  -32, -32, -32, -24, -32, -32, -31, -32, -3, -12,
  -32, -16, -32, -18, -32, -5, -23, -26, -27, -21,
  -32, -29, -32, 47, -13, -32, -17, -19, -32, -25,
  -28, -22, -30, -14, -32, -20, -15
]

# Goto tables.
racc_goto_table = [
  18, 2, 27, 32, 25, 26, 1, nil, nil, nil,
  nil, nil, nil, nil, nil, 39, nil, nil, nil, nil,
  nil, nil, nil, nil, nil, nil, 38, nil, nil, nil,
  nil, nil, nil, nil, nil, nil, 18
]

racc_goto_check = [
  3, 2, 13, 8, 11, 12, 1, nil, nil, nil,
  nil, nil, nil, nil, nil, 13, nil, nil, nil, nil,
  nil, nil, nil, nil, nil, nil, 2, nil, nil, nil,
  nil, nil, nil, nil, nil, nil, 3
]

racc_goto_pointer = [
  nil, 6, 1, -2, nil, nil, nil, nil, -12, nil,
  nil, -9, -8, -11
]

racc_goto_default = [
  nil, nil, nil, 3, 4, 5, 6, 7, 8, 9,
  13, nil, nil, nil
]

# One (length, non-terminal, method) triple per grammar rule.
racc_reduce_table = [
  0, 0, :racc_error,
  1, 19, :_reduce_1,
  0, 19, :_reduce_2,
  2, 20, :_reduce_3,
  1, 20, :_reduce_4,
  0, 20, :_reduce_5,
  1, 21, :_reduce_none,
  1, 21, :_reduce_none,
  1, 21, :_reduce_none,
  1, 21, :_reduce_none,
  1, 21, :_reduce_none,
  1, 21, :_reduce_none,
  2, 22, :_reduce_12,
  3, 22, :_reduce_13,
  4, 22, :_reduce_14,
  5, 22, :_reduce_15,
  2, 23, :_reduce_16,
  3, 23, :_reduce_17,
  2, 24, :_reduce_18,
  3, 24, :_reduce_19,
  4, 25, :_reduce_20,
  2, 28, :_reduce_21,
  3, 28, :_reduce_22,
  1, 29, :_reduce_23,
  0, 29, :_reduce_24,
  2, 30, :_reduce_25,
  1, 30, :_reduce_26,
  1, 31, :_reduce_27,
  2, 31, :_reduce_28,
  2, 27, :_reduce_29,
  3, 27, :_reduce_30,
  1, 26, :_reduce_31
]

racc_reduce_n = 32

racc_shift_n = 47

# Maps the token symbols returned by #next_token to their numeric IDs.
racc_token_table = {
  false             => 0,
  :error            => 1,
  :T_STRING         => 2,
  :T_TEXT           => 3,
  :T_DOCTYPE_START  => 4,
  :T_DOCTYPE_END    => 5,
  :T_DOCTYPE_TYPE   => 6,
  :T_CDATA_START    => 7,
  :T_CDATA_END      => 8,
  :T_COMMENT_START  => 9,
  :T_COMMENT_END    => 10,
  :T_ELEM_START     => 11,
  :T_ELEM_NAME      => 12,
  :T_ELEM_NS        => 13,
  :T_ELEM_END       => 14,
  :T_ATTR           => 15,
  :T_XML_DECL_START => 16,
  :T_XML_DECL_END   => 17
}

# Numeric IDs at or above this value belong to non-terminals.
racc_nt_base = 18

racc_use_result_var = false

# NOTE(review): the element order below is kept exactly as generated;
# presumably the Racc runtime reads this array positionally.
Racc_arg = [
  racc_action_table,
  racc_action_check,
  racc_action_default,
  racc_action_pointer,
  racc_goto_table,
  racc_goto_check,
  racc_goto_default,
  racc_goto_pointer,
  racc_nt_base,
  racc_reduce_table,
  racc_token_table,
  racc_shift_n,
  racc_reduce_n,
  racc_use_result_var
]

# Printable names indexed by numeric ID: terminals first, then non-terminals.
Racc_token_to_s_table = %w[
  $end
  error
  T_STRING
  T_TEXT
  T_DOCTYPE_START
  T_DOCTYPE_END
  T_DOCTYPE_TYPE
  T_CDATA_START
  T_CDATA_END
  T_COMMENT_START
  T_COMMENT_END
  T_ELEM_START
  T_ELEM_NAME
  T_ELEM_NS
  T_ELEM_END
  T_ATTR
  T_XML_DECL_START
  T_XML_DECL_END
  $start
  document
  expressions
  expression
  doctype
  cdata
  comment
  element
  text
  xmldecl
  element_open
  attributes
  attributes_
  attribute
]

Racc_debug_parser = false
|
||||
|
||||
##### State transition tables end #####
|
||||
|
||||
# reduce 0 omitted
|
||||
|
||||
# Reduce action 1: wraps the first reduced value in a :document node.
def _reduce_1(val, _values)
  body = val.first

  s(:document, body)
end
|
||||
|
||||
# Reduce action 2: builds a :document node without children.
def _reduce_2(val, _values)
  s(:document)
end
|
||||
|
||||
# Reduce action 3: returns the reduced values with nil entries removed.
def _reduce_3(val, _values)
  val.reject { |item| item.nil? }
end
|
||||
|
||||
# Reduce action 4: passes the first reduced value through unchanged.
def _reduce_4(val, _values)
  val.first
end
|
||||
|
||||
# Reduce action 5: yields nil, ignoring the reduced values.
def _reduce_5(val, _values)
  return nil
end
|
||||
|
||||
# reduce 6 omitted
|
||||
|
||||
# reduce 7 omitted
|
||||
|
||||
# reduce 8 omitted
|
||||
|
||||
# reduce 9 omitted
|
||||
|
||||
# reduce 10 omitted
|
||||
|
||||
# reduce 11 omitted
|
||||
|
||||
# Reduce action 12: builds a :doctype node without children.
def _reduce_12(val, _values)
  s(:doctype)
end
|
||||
|
||||
# Reduce action 13: builds a :doctype node from the second reduced value.
def _reduce_13(val, _values)
  _first, second = val

  s(:doctype, second)
end
|
||||
|
||||
# Reduce action 14: builds a :doctype node from the second and third reduced
# values.
def _reduce_14(val, _values)
  s(:doctype, *val.values_at(1, 2))
end
|
||||
|
||||
# Reduce action 15: builds a :doctype node from the second, third and fourth
# reduced values.
def _reduce_15(val, _values)
  s(:doctype, *val.values_at(1, 2, 3))
end
|
||||
|
||||
# Reduce action 16: builds a :cdata node without children.
def _reduce_16(val, _values)
  s(:cdata)
end
|
||||
|
||||
# Reduce action 17: builds a :cdata node from the second reduced value.
def _reduce_17(val, _values)
  _first, second = val

  s(:cdata, second)
end
|
||||
|
||||
# Reduce action 18: builds a :comment node without children.
def _reduce_18(val, _values)
  s(:comment)
end
|
||||
|
||||
# Reduce action 19: builds a :comment node from the second reduced value.
def _reduce_19(val, _values)
  _first, second = val

  s(:comment, second)
end
|
||||
|
||||
# Reduce action 20: builds an :element node from the first three reduced
# values; #s flattens any of them that are arrays.
def _reduce_20(val, _values)
  s(:element, *val.values_at(0, 1, 2))
end
|
||||
|
||||
# Reduce action 21: returns a pair whose first slot is nil and whose second
# slot is the second reduced value.
def _reduce_21(val, _values)
  _first, second = val

  [nil, second]
end
|
||||
|
||||
# Reduce action 22: returns the second and third reduced values as a pair.
def _reduce_22(val, _values)
  val.values_at(1, 2)
end
|
||||
|
||||
# Reduce action 23: wraps the first reduced value in an :attributes node.
def _reduce_23(val, _values)
  items = val.first

  s(:attributes, items)
end
|
||||
|
||||
# Reduce action 24: yields nil, ignoring the reduced values.
def _reduce_24(val, _values)
  return nil
end
|
||||
|
||||
# Reduce action 25: hands back the array of reduced values as-is (the very
# same object, possibly nested).
def _reduce_25(val, _values)
  return val
end
|
||||
|
||||
# Reduce action 26: hands back the array of reduced values as-is (the very
# same object).
def _reduce_26(val, _values)
  return val
end
|
||||
|
||||
# Reduce action 27: builds an :attribute node from the first reduced value.
def _reduce_27(val, _values)
  name = val.first

  s(:attribute, name)
end
|
||||
|
||||
# Reduce action 28: builds an :attribute node from the first two reduced
# values.
def _reduce_28(val, _values)
  s(:attribute, *val.values_at(0, 1))
end
|
||||
|
||||
# Reduce action 29: builds an :xml_decl node without children.
def _reduce_29(val, _values)
  s(:xml_decl)
end
|
||||
|
||||
# Reduce action 30: builds an :xml_decl node from the second reduced value.
def _reduce_30(val, _values)
  _first, second = val

  s(:xml_decl, second)
end
|
||||
|
||||
# Reduce action 31: builds a :text node from the first reduced value.
def _reduce_31(val, _values)
  content = val.first

  s(:text, content)
end
|
||||
|
||||
# Shared action for rules without their own method (the :_reduce_none entries
# in the reduce table): passes the first reduced value through unchanged.
def _reduce_none(val, _values)
  val.first
end
|
||||
|
||||
end # class Parser
|
||||
end # module XML
|
||||
end # module Oga
|
|
@ -5,9 +5,9 @@
|
|||
# It requires every tag to have a closing tag. As such you'll need to enable
|
||||
# HTML parsing mode when parsing HTML. This can be done as following:
|
||||
#
|
||||
# parser = Oga::Parser.new(:html => true)
|
||||
# parser = Oga::XML::Parser.new(:html => true)
|
||||
#
|
||||
class Oga::Parser
|
||||
class Oga::XML::Parser
|
||||
|
||||
token T_STRING T_TEXT
|
||||
token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE
|
|
@ -1,6 +1,6 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Lexer do
|
||||
describe Oga::XML::Lexer do
|
||||
context 'cdata tags' do
|
||||
example 'lex a cdata tag' do
|
||||
lex('<![CDATA[foo]]>').should == [
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Lexer do
|
||||
describe Oga::XML::Lexer do
|
||||
context 'comments' do
|
||||
example 'lex a comment' do
|
||||
lex('<!-- foo -->').should == [
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Lexer do
|
||||
describe Oga::XML::Lexer do
|
||||
context 'doctypes' do
|
||||
example 'lex the HTML5 doctype' do
|
||||
lex('<!DOCTYPE html>').should == [
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Lexer do
|
||||
describe Oga::XML::Lexer do
|
||||
context 'HTML documents' do
|
||||
example 'lex a basic HTML document' do
|
||||
html = <<-EOF
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Lexer do
|
||||
describe Oga::XML::Lexer do
|
||||
context 'elements' do
|
||||
example 'lex an opening element' do
|
||||
lex('<p>').should == [
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Lexer do
|
||||
describe Oga::XML::Lexer do
|
||||
context 'regular text' do
|
||||
example 'lex regular text' do
|
||||
lex('hello').should == [[:T_TEXT, 'hello', 1]]
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Lexer do
|
||||
describe Oga::XML::Lexer do
|
||||
context 'HTML void elements' do
|
||||
example 'lex a void element that omits the closing /' do
|
||||
lex('<link>', :html => true).should == [
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Lexer do
|
||||
describe Oga::XML::Lexer do
|
||||
context 'XML declaration tags' do
|
||||
example 'lex a start tag' do
|
||||
lex('<?xml').should == [[:T_XML_DECL_START, nil, 1]]
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Parser do
|
||||
describe Oga::XML::Parser do
|
||||
context 'cdata tags' do
|
||||
example 'parse a cdata tag' do
|
||||
parse('<![CDATA[foo]]>').should == s(:document, s(:cdata, 'foo'))
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Parser do
|
||||
describe Oga::XML::Parser do
|
||||
context 'comments' do
|
||||
example 'parse an empty comment' do
|
||||
parse('<!---->').should == s(:document, s(:comment))
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Parser do
|
||||
describe Oga::XML::Parser do
|
||||
context 'doctypes' do
|
||||
example 'parse a doctype' do
|
||||
parse('<!DOCTYPE html>').should == s(:document, s(:doctype))
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Parser do
|
||||
describe Oga::XML::Parser do
|
||||
context 'HTML documents' do
|
||||
example 'parse a basic HTML document' do
|
||||
html = <<-EOF
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Parser do
|
||||
describe Oga::XML::Parser do
|
||||
context 'elements' do
|
||||
example 'parse an empty element' do
|
||||
parse('<p></p>').should == s(
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Parser do
|
||||
describe Oga::XML::Parser do
|
||||
example 'parse regular text' do
|
||||
parse('foo').should == s(:document, s(:text, 'foo'))
|
||||
end
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Parser do
|
||||
describe Oga::XML::Parser do
|
||||
context 'HTML void elements' do
|
||||
example 'parse a void element that omits the closing /' do
|
||||
parse('<link>', :html => true).should == s(
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Parser do
|
||||
describe Oga::XML::Parser do
|
||||
context 'XML declaration tags' do
|
||||
example 'lex an XML declaration tag' do
|
||||
parse('<?xml hello ?>').should == s(
|
||||
|
|
|
@ -19,7 +19,7 @@ module Oga
|
|||
# @return [Array]
|
||||
#
|
||||
def lex(input, options = {})
|
||||
return Oga::Lexer.new(options).lex(input)
|
||||
return Oga::XML::Lexer.new(options).lex(input)
|
||||
end
|
||||
|
||||
##
|
||||
|
@ -30,7 +30,7 @@ module Oga
|
|||
# @return [Oga::AST::Node]
|
||||
#
|
||||
def parse(input, options = {})
|
||||
return Oga::Parser.new(options).parse(input)
|
||||
return Oga::XML::Parser.new(options).parse(input)
|
||||
end
|
||||
end # ParsingHelpers
|
||||
end # Oga
|
||||
|
|
Loading…
Reference in New Issue