Basic lexing of HTML tags.
The current implementation is a bit messy. In particular, the column numbers are not yet counted entirely correctly. There are also some problems with nested tags/text that I still have to resolve.
This commit is contained in:
parent
d9ef33e1f8
commit
a5a3b8db3f
|
@ -79,6 +79,8 @@ module Oga
|
|||
end
|
||||
|
||||
def emit_text_buffer
|
||||
return if @text_buffer.empty?
|
||||
|
||||
add_token(:T_TEXT, @text_buffer)
|
||||
|
||||
@text_buffer = ''
|
||||
|
@ -98,12 +100,8 @@ module Oga
|
|||
newline = '\n' | '\r\n';
|
||||
whitespace = [ \t];
|
||||
|
||||
action emit_space {
|
||||
t(:T_SPACE)
|
||||
}
|
||||
|
||||
action emit_newline {
|
||||
t(:T_NEWLINE)
|
||||
t(:T_TEXT)
|
||||
advance_line
|
||||
}
|
||||
|
||||
|
@ -228,9 +226,66 @@ module Oga
|
|||
any => buffer_text;
|
||||
*|;
|
||||
|
||||
# Elements
|
||||
#
|
||||
# http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
|
||||
#
|
||||
element_name = [a-zA-Z0-9\-_]+;
|
||||
element_start = '<' element_name;
|
||||
|
||||
# First emit the token, then advance the column. This way the column
|
||||
# number points to the < and not the "p" in <p>.
|
||||
action open_element {
|
||||
t(:T_ELEM_OPEN, p)
|
||||
|
||||
advance_column
|
||||
|
||||
fcall element;
|
||||
}
|
||||
|
||||
element_text := |*
|
||||
^'<' => buffer_text;
|
||||
|
||||
'<' => {
|
||||
emit_text_buffer
|
||||
fhold;
|
||||
fret;
|
||||
};
|
||||
*|;
|
||||
|
||||
element := |*
|
||||
whitespace => { advance_column };
|
||||
|
||||
element_start => open_element;
|
||||
|
||||
# Consume the text inside the element.
|
||||
'>' => {
|
||||
advance_column
|
||||
fcall element_text;
|
||||
};
|
||||
|
||||
# Attributes and their values.
|
||||
element_name
|
||||
%{
|
||||
t(:T_ATTR, @ts, p)
|
||||
}
|
||||
'=' (dquote @string_dquote | squote @string_squote);
|
||||
|
||||
# Non self-closing tags.
|
||||
'</' element_name {
|
||||
emit_text_buffer
|
||||
t(:T_ELEM_CLOSE, p)
|
||||
|
||||
# Advance by two to take the closing </ into account. This is done
|
||||
# after emitting tokens to ensure that they point to the start of
|
||||
# the tag.
|
||||
advance_column(2)
|
||||
fret;
|
||||
};
|
||||
*|;
|
||||
|
||||
main := |*
|
||||
whitespace => emit_space;
|
||||
newline => emit_newline;
|
||||
newline => emit_newline;
|
||||
|
||||
doctype_start => {
|
||||
t(:T_DOCTYPE_START)
|
||||
|
@ -247,19 +302,10 @@ module Oga
|
|||
fcall comment;
|
||||
};
|
||||
|
||||
# General rules and actions.
|
||||
'<' => { t(:T_SMALLER) };
|
||||
'>' => { t(:T_GREATER) };
|
||||
'/' => { t(:T_SLASH) };
|
||||
'-' => { t(:T_DASH) };
|
||||
']' => { t(:T_RBRACKET) };
|
||||
'[' => { t(:T_LBRACKET) };
|
||||
':' => { t(:T_COLON) };
|
||||
'!' => { t(:T_BANG) };
|
||||
'=' => { t(:T_EQUALS) };
|
||||
element_start => open_element;
|
||||
|
||||
dquote => { t(:T_DQUOTE) };
|
||||
squote => { t(:T_SQUOTE) };
|
||||
#dquote => { t(:T_DQUOTE) };
|
||||
#squote => { t(:T_SQUOTE) };
|
||||
*|;
|
||||
}%%
|
||||
end # Lexer
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Lexer do
|
||||
context 'elements' do
|
||||
example 'lex an opening element' do
|
||||
lex('<p>').should == [
|
||||
[:T_ELEM_OPEN, 'p', 1, 1]
|
||||
]
|
||||
end
|
||||
|
||||
example 'lex an opening an closing element' do
|
||||
lex('<p></p>').should == [
|
||||
[:T_ELEM_OPEN, 'p', 1, 1],
|
||||
[:T_ELEM_CLOSE, 'p', 1, 4]
|
||||
]
|
||||
end
|
||||
|
||||
example 'lex a paragraph element with text inside it' do
|
||||
lex('<p>Hello</p>').should == [
|
||||
[:T_ELEM_OPEN, 'p', 1, 1],
|
||||
[:T_TEXT, 'Hello', 1, 4],
|
||||
[:T_ELEM_CLOSE, 'p', 1, 9]
|
||||
]
|
||||
end
|
||||
|
||||
example 'lex a paragraph element with attributes' do
|
||||
lex('<p class="foo">Hello</p>').should == [
|
||||
[:T_ELEM_OPEN, 'p', 1, 1],
|
||||
[:T_ATTR, 'class', 1, 4],
|
||||
[:T_STRING, 'foo', 1, 10],
|
||||
[:T_TEXT, 'Hello', 1, 15],
|
||||
[:T_ELEM_CLOSE, 'p', 1, 20]
|
||||
]
|
||||
end
|
||||
end
|
||||
|
||||
context 'nested elements' do
|
||||
example 'lex a nested element' do
|
||||
lex('<p><a></a></p>').should == [
|
||||
[:T_ELEM_OPEN, 'p', 1, 1],
|
||||
[:T_ELEM_OPEN, 'a', 1, 4],
|
||||
[:T_ELEM_CLOSE, 'a', 1, 7],
|
||||
[:T_ELEM_CLOSE, 'p', 1, 11]
|
||||
]
|
||||
end
|
||||
end
|
||||
end
|
|
@ -9,24 +9,17 @@ describe Oga::Lexer do
|
|||
|
||||
context 'whitespace' do
|
||||
example 'lex regular whitespace' do
|
||||
lex(' ').should == [[:T_SPACE, ' ', 1, 1]]
|
||||
lex(' ').should == [[:T_TEXT, ' ', 1, 1]]
|
||||
end
|
||||
|
||||
example 'lex a newline' do
|
||||
lex("\n").should == [[:T_NEWLINE, "\n", 1, 1]]
|
||||
end
|
||||
|
||||
example 'advance column numbers for spaces' do
|
||||
lex(' ').should == [
|
||||
[:T_SPACE, ' ', 1, 1],
|
||||
[:T_SPACE, ' ', 1, 2]
|
||||
]
|
||||
lex("\n").should == [[:T_TEXT, "\n", 1, 1]]
|
||||
end
|
||||
|
||||
example 'advance line numbers for newlines' do
|
||||
lex("\n ").should == [
|
||||
[:T_NEWLINE, "\n", 1, 1],
|
||||
[:T_SPACE, ' ', 2, 1]
|
||||
[:T_TEXT, "\n", 1, 1],
|
||||
[:T_TEXT, ' ', 2, 1]
|
||||
]
|
||||
end
|
||||
end
|
||||
|
|
|
@ -1,72 +0,0 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Oga::Lexer do
|
||||
context 'tags' do
|
||||
example 'lex an opening tag' do
|
||||
lex('<p>').should == [
|
||||
[:T_SMALLER, '<', 1, 1],
|
||||
[:T_TEXT, 'p', 1, 2],
|
||||
[:T_GREATER, '>', 1, 3]
|
||||
]
|
||||
end
|
||||
|
||||
example 'lex an opening tag with an attribute' do
|
||||
lex('<p title="Foo">').should == [
|
||||
[:T_SMALLER, '<', 1, 1],
|
||||
[:T_TEXT, 'p', 1, 2],
|
||||
[:T_SPACE, ' ', 1, 3],
|
||||
[:T_TEXT, 'title', 1, 4],
|
||||
[:T_EQUALS, '=', 1, 9],
|
||||
[:T_DQUOTE, '"', 1, 10],
|
||||
[:T_TEXT, 'Foo', 1, 11],
|
||||
[:T_DQUOTE, '"', 1, 14],
|
||||
[:T_GREATER, '>', 1, 15]
|
||||
]
|
||||
end
|
||||
|
||||
example 'lex a tag with text inside it' do
|
||||
lex('<p>Foo</p>').should == [
|
||||
[:T_SMALLER, '<', 1, 1],
|
||||
[:T_TEXT, 'p', 1, 2],
|
||||
[:T_GREATER, '>', 1, 3],
|
||||
[:T_TEXT, 'Foo', 1, 4],
|
||||
[:T_SMALLER, '<', 1, 7],
|
||||
[:T_SLASH, '/', 1, 8],
|
||||
[:T_TEXT, 'p', 1, 9],
|
||||
[:T_GREATER, '>', 1, 10]
|
||||
]
|
||||
end
|
||||
|
||||
example 'lex a tag with an attribute with a dash in it' do
|
||||
lex('<p foo-bar="baz">').should == [
|
||||
[:T_SMALLER, '<', 1, 1],
|
||||
[:T_TEXT, 'p', 1, 2],
|
||||
[:T_SPACE, ' ', 1, 3],
|
||||
[:T_TEXT, 'foo', 1, 4],
|
||||
[:T_DASH, '-', 1, 7],
|
||||
[:T_TEXT, 'bar', 1, 8],
|
||||
[:T_EQUALS, '=', 1, 11],
|
||||
[:T_DQUOTE, '"', 1, 12],
|
||||
[:T_TEXT, 'baz', 1, 13],
|
||||
[:T_DQUOTE, '"', 1, 16],
|
||||
[:T_GREATER, '>', 1, 17]
|
||||
]
|
||||
end
|
||||
end
|
||||
|
||||
context 'tags with namespaces' do
|
||||
example 'lex a tag with a dummy namespace' do
|
||||
lex('<foo:p></p>').should == [
|
||||
[:T_SMALLER, '<', 1, 1],
|
||||
[:T_TEXT, 'foo', 1, 2],
|
||||
[:T_COLON, ':', 1, 5],
|
||||
[:T_TEXT, 'p', 1, 6],
|
||||
[:T_GREATER, '>', 1, 7],
|
||||
[:T_SMALLER, '<', 1, 8],
|
||||
[:T_SLASH, '/', 1, 9],
|
||||
[:T_TEXT, 'p', 1, 10],
|
||||
[:T_GREATER, '>', 1, 11]
|
||||
]
|
||||
end
|
||||
end
|
||||
end
|
Loading…
Reference in New Issue