Removed the buffering crap from the lexer.

This commit is contained in:
Yorick Peterse 2014-05-04 17:39:08 +02:00
parent 57255012b7
commit f18e8893de
1 changed file with 25 additions and 67 deletions

View File

@ -216,26 +216,21 @@ module Oga
end end
## ##
# Emits the current buffer if we have any. The current line number is # Emits a text token.
# advanced based on the amount of newlines in the buffer.
# #
# @param [Fixnum] position The end position of the buffer. # @param [Fixnum] start
# @param [Symbol] type The type of node to emit. # @param [Fixnum] stop
# #
def emit_buffer(position, type = :T_TEXT) def emit_text(start, stop)
return unless @buffer_start_position content = text(start, stop)
content = text(@buffer_start_position, position)
unless content.empty? unless content.empty?
add_token(type, content) add_token(:T_TEXT, content)
lines = content.count("\n") lines = content.count("\n")
advance_line(lines) if lines > 0 advance_line(lines) if lines > 0
end end
@buffer_start_position = nil
end end
## ##
@ -262,36 +257,22 @@ module Oga
dquote = '"'; dquote = '"';
squote = "'"; squote = "'";
action start_string_dquote {
start_buffer(te)
fcall string_dquote;
}
action start_string_squote {
start_buffer(te)
fcall string_squote;
}
# Machine for processing double quoted strings. # Machine for processing double quoted strings.
string_dquote := |* string_dquote := |*
dquote => { ^dquote+ => {
emit_buffer(ts, :T_STRING) emit(:T_STRING, ts, te)
fret;
}; };
any; dquote => { fret; };
*|; *|;
# Machine for processing single quoted strings. # Machine for processing single quoted strings.
string_squote := |* string_squote := |*
squote => { ^squote+ => {
emit_buffer(ts, :T_STRING) emit(:T_STRING, ts, te)
fret;
}; };
any; squote => { fret; };
*|; *|;
# DOCTYPES # DOCTYPES
@ -307,7 +288,6 @@ module Oga
doctype_start = '<!DOCTYPE'i whitespace+; doctype_start = '<!DOCTYPE'i whitespace+;
action start_doctype { action start_doctype {
emit_buffer(ts)
add_token(:T_DOCTYPE_START) add_token(:T_DOCTYPE_START)
fcall doctype; fcall doctype;
} }
@ -318,8 +298,8 @@ module Oga
'PUBLIC' | 'SYSTEM' => { emit(:T_DOCTYPE_TYPE, ts, te) }; 'PUBLIC' | 'SYSTEM' => { emit(:T_DOCTYPE_TYPE, ts, te) };
# Lex the public/system IDs as regular strings. # Lex the public/system IDs as regular strings.
dquote => start_string_dquote; dquote => { fcall string_dquote; };
squote => start_string_squote; squote => { fcall string_squote; };
# Whitespace inside doctypes is ignored since there's no point in # Whitespace inside doctypes is ignored since there's no point in
# including it. # including it.
@ -347,25 +327,20 @@ module Oga
cdata_end = ']]>'; cdata_end = ']]>';
action start_cdata { action start_cdata {
emit_buffer(ts)
add_token(:T_CDATA_START) add_token(:T_CDATA_START)
start_buffer(te)
fcall cdata; fcall cdata;
} }
# Machine for processing the contents of CDATA tags. Everything # Machine for processing the contents of CDATA tags. Everything
# inside a CDATA tag is treated as plain text. # inside a CDATA tag is treated as plain text.
cdata := |* cdata := |*
cdata_end => { any* cdata_end => {
emit_buffer(ts) emit_text(ts, te - 3)
add_token(:T_CDATA_END) add_token(:T_CDATA_END)
fret; fret;
}; };
any;
*|; *|;
# Comments # Comments
@ -383,25 +358,20 @@ module Oga
comment_end = '-->'; comment_end = '-->';
action start_comment { action start_comment {
emit_buffer(ts)
add_token(:T_COMMENT_START) add_token(:T_COMMENT_START)
start_buffer(te)
fcall comment; fcall comment;
} }
# Machine used for processing the contents of a comment. Everything # Machine used for processing the contents of a comment. Everything
# inside a comment is treated as plain text (similar to CDATA tags). # inside a comment is treated as plain text (similar to CDATA tags).
comment := |* comment := |*
comment_end => { any* comment_end => {
emit_buffer(ts) emit_text(ts, te - 3)
add_token(:T_COMMENT_END) add_token(:T_COMMENT_END)
fret; fret;
}; };
any;
*|; *|;
# XML declaration tags # XML declaration tags
@ -412,18 +382,14 @@ module Oga
xml_decl_end = '?>'; xml_decl_end = '?>';
action start_xml_decl { action start_xml_decl {
emit_buffer(ts)
add_token(:T_XML_DECL_START) add_token(:T_XML_DECL_START)
start_buffer(te)
fcall xml_decl; fcall xml_decl;
} }
# Machine that processes the contents of an XML declaration tag. # Machine that processes the contents of an XML declaration tag.
xml_decl := |* xml_decl := |*
xml_decl_end => { xml_decl_end => {
emit_buffer(ts)
add_token(:T_XML_DECL_END) add_token(:T_XML_DECL_END)
fret; fret;
@ -432,8 +398,8 @@ module Oga
# Attributes and their values (e.g. version="1.0"). # Attributes and their values (e.g. version="1.0").
identifier => { emit(:T_ATTR, ts, te) }; identifier => { emit(:T_ATTR, ts, te) };
dquote => start_string_dquote; dquote => { fcall string_dquote; };
squote => start_string_squote; squote => { fcall string_squote; };
any; any;
*|; *|;
@ -447,7 +413,6 @@ module Oga
# namespace (if any). Remaining work is delegated to a dedicated # namespace (if any). Remaining work is delegated to a dedicated
# machine. # machine.
action start_element { action start_element {
emit_buffer(ts)
add_token(:T_ELEM_START) add_token(:T_ELEM_START)
# Add the element name. If the name includes a namespace we'll break # Add the element name. If the name includes a namespace we'll break
@ -484,8 +449,8 @@ module Oga
identifier => { emit(:T_ATTR, ts, te) }; identifier => { emit(:T_ATTR, ts, te) };
# Attribute values. # Attribute values.
dquote => start_string_dquote; dquote => { fcall string_dquote; };
squote => start_string_squote; squote => { fcall string_squote; };
# The closing character of the open tag. # The closing character of the open tag.
('>' | '/') => { ('>' | '/') => {
@ -512,7 +477,6 @@ module Oga
# Regular closing tags. # Regular closing tags.
'</' identifier '>' => { '</' identifier '>' => {
emit_buffer(ts)
add_token(:T_ELEM_END, nil) add_token(:T_ELEM_END, nil)
@elements.pop if html? @elements.pop if html?
@ -527,14 +491,8 @@ module Oga
# Note that this rule should be declared at the very bottom as it # Note that this rule should be declared at the very bottom as it
# will otherwise take precedence over the other rules. # will otherwise take precedence over the other rules.
any => { ^('<' | '>')+ => {
# First character, start buffering (unless we already are buffering). emit_text(ts, te)
start_buffer(ts) unless @buffer_start_position
# EOF, emit the text buffer.
if te == eof
emit_buffer(te)
end
}; };
*|; *|;
}%% }%%