Basic lexing of HTML tags.

The current implementation is a bit messy. In particular, the column numbers
are not yet counted entirely correctly. There are also some problems with
nested tags/text that I still have to resolve.
This commit is contained in:
Yorick Peterse 2014-03-03 22:08:46 +01:00
parent d9ef33e1f8
commit a5a3b8db3f
4 changed files with 116 additions and 102 deletions

View File

@ -79,6 +79,8 @@ module Oga
end
def emit_text_buffer
return if @text_buffer.empty?
add_token(:T_TEXT, @text_buffer)
@text_buffer = ''
@ -98,12 +100,8 @@ module Oga
newline = '\n' | '\r\n';
whitespace = [ \t];
action emit_space {
t(:T_SPACE)
}
action emit_newline {
t(:T_NEWLINE)
t(:T_TEXT)
advance_line
}
@ -228,8 +226,65 @@ module Oga
any => buffer_text;
*|;
# Elements
#
# http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
#
element_name = [a-zA-Z0-9\-_]+;
element_start = '<' element_name;
# First emit the token, then advance the column. This way the column
# number points to the < and not the "p" in <p>.
action open_element {
t(:T_ELEM_OPEN, p)
advance_column
fcall element;
}
element_text := |*
^'<' => buffer_text;
'<' => {
emit_text_buffer
fhold;
fret;
};
*|;
element := |*
whitespace => { advance_column };
element_start => open_element;
# Consume the text inside the element.
'>' => {
advance_column
fcall element_text;
};
# Attributes and their values.
element_name
%{
t(:T_ATTR, @ts, p)
}
'=' (dquote @string_dquote | squote @string_squote);
# Non self-closing tags.
'</' element_name {
emit_text_buffer
t(:T_ELEM_CLOSE, p)
# Advance by two to take the closing </ into account. This is done
# after emitting tokens to ensure that they point to the start of
# the tag.
advance_column(2)
fret;
};
*|;
main := |*
whitespace => emit_space;
newline => emit_newline;
doctype_start => {
@ -247,19 +302,10 @@ module Oga
fcall comment;
};
# General rules and actions.
'<' => { t(:T_SMALLER) };
'>' => { t(:T_GREATER) };
'/' => { t(:T_SLASH) };
'-' => { t(:T_DASH) };
']' => { t(:T_RBRACKET) };
'[' => { t(:T_LBRACKET) };
':' => { t(:T_COLON) };
'!' => { t(:T_BANG) };
'=' => { t(:T_EQUALS) };
element_start => open_element;
dquote => { t(:T_DQUOTE) };
squote => { t(:T_SQUOTE) };
#dquote => { t(:T_DQUOTE) };
#squote => { t(:T_SQUOTE) };
*|;
}%%
end # Lexer

View File

@ -0,0 +1,47 @@
require 'spec_helper'

# Specs for lexing HTML elements with Oga::Lexer.
#
# Every expected token is written as [type, value, line, column]. The `lex`
# helper is not defined in this file; presumably it comes from spec_helper and
# returns the list of tokens produced for the given input string.
describe Oga::Lexer do
  context 'elements' do
    # A lone opening tag yields a single T_ELEM_OPEN token positioned at the
    # "<" of the tag.
    example 'lex an opening element' do
      lex('<p>').should == [
        [:T_ELEM_OPEN, 'p', 1, 1]
      ]
    end

    # The closing tag is reported at the column of its "<".
    example 'lex an opening and closing element' do
      lex('<p></p>').should == [
        [:T_ELEM_OPEN, 'p', 1, 1],
        [:T_ELEM_CLOSE, 'p', 1, 4]
      ]
    end

    # Text between the tags is emitted as one T_TEXT token.
    example 'lex a paragraph element with text inside it' do
      lex('<p>Hello</p>').should == [
        [:T_ELEM_OPEN, 'p', 1, 1],
        [:T_TEXT, 'Hello', 1, 4],
        [:T_ELEM_CLOSE, 'p', 1, 9]
      ]
    end

    # Attributes produce a T_ATTR token for the name followed by a T_STRING
    # token for the quoted value.
    #
    # NOTE(review): counting characters in the input, "Hello" starts at
    # column 16 and "</p>" at column 21, which is one more than the columns
    # expected below. Presumably these values mirror the lexer's current
    # column counting rather than the intended positions; confirm before
    # relying on them.
    example 'lex a paragraph element with attributes' do
      lex('<p class="foo">Hello</p>').should == [
        [:T_ELEM_OPEN, 'p', 1, 1],
        [:T_ATTR, 'class', 1, 4],
        [:T_STRING, 'foo', 1, 10],
        [:T_TEXT, 'Hello', 1, 15],
        [:T_ELEM_CLOSE, 'p', 1, 20]
      ]
    end
  end

  context 'nested elements' do
    # An element directly inside another element: tokens are emitted in
    # document order, inner pair between the outer open and close tokens.
    example 'lex a nested element' do
      lex('<p><a></a></p>').should == [
        [:T_ELEM_OPEN, 'p', 1, 1],
        [:T_ELEM_OPEN, 'a', 1, 4],
        [:T_ELEM_CLOSE, 'a', 1, 7],
        [:T_ELEM_CLOSE, 'p', 1, 11]
      ]
    end
  end
end

View File

@ -9,24 +9,17 @@ describe Oga::Lexer do
context 'whitespace' do
example 'lex regular whitespace' do
lex(' ').should == [[:T_SPACE, ' ', 1, 1]]
lex(' ').should == [[:T_TEXT, ' ', 1, 1]]
end
example 'lex a newline' do
lex("\n").should == [[:T_NEWLINE, "\n", 1, 1]]
end
example 'advance column numbers for spaces' do
lex(' ').should == [
[:T_SPACE, ' ', 1, 1],
[:T_SPACE, ' ', 1, 2]
]
lex("\n").should == [[:T_TEXT, "\n", 1, 1]]
end
example 'advance line numbers for newlines' do
lex("\n ").should == [
[:T_NEWLINE, "\n", 1, 1],
[:T_SPACE, ' ', 2, 1]
[:T_TEXT, "\n", 1, 1],
[:T_TEXT, ' ', 2, 1]
]
end
end

View File

@ -1,72 +0,0 @@
require 'spec_helper'

# Specs for lexing tags as a flat stream of punctuation and text tokens
# (T_SMALLER, T_TEXT, T_GREATER, ...). Every expected token is written as
# [type, value, line, column].
describe Oga::Lexer do
  context 'tags' do
    example 'lex an opening tag' do
      expected = [
        [:T_SMALLER, '<', 1, 1],
        [:T_TEXT, 'p', 1, 2],
        [:T_GREATER, '>', 1, 3]
      ]

      lex('<p>').should == expected
    end

    # The attribute name, "=", both quotes and the value are separate tokens.
    example 'lex an opening tag with an attribute' do
      expected = [
        [:T_SMALLER, '<', 1, 1],
        [:T_TEXT, 'p', 1, 2],
        [:T_SPACE, ' ', 1, 3],
        [:T_TEXT, 'title', 1, 4],
        [:T_EQUALS, '=', 1, 9],
        [:T_DQUOTE, '"', 1, 10],
        [:T_TEXT, 'Foo', 1, 11],
        [:T_DQUOTE, '"', 1, 14],
        [:T_GREATER, '>', 1, 15]
      ]

      lex('<p title="Foo">').should == expected
    end

    # The closing tag is lexed as "<", "/", the name and ">".
    example 'lex a tag with text inside it' do
      expected = [
        [:T_SMALLER, '<', 1, 1],
        [:T_TEXT, 'p', 1, 2],
        [:T_GREATER, '>', 1, 3],
        [:T_TEXT, 'Foo', 1, 4],
        [:T_SMALLER, '<', 1, 7],
        [:T_SLASH, '/', 1, 8],
        [:T_TEXT, 'p', 1, 9],
        [:T_GREATER, '>', 1, 10]
      ]

      lex('<p>Foo</p>').should == expected
    end

    # A dash inside an attribute name splits the name into text, T_DASH and
    # text tokens.
    example 'lex a tag with an attribute with a dash in it' do
      expected = [
        [:T_SMALLER, '<', 1, 1],
        [:T_TEXT, 'p', 1, 2],
        [:T_SPACE, ' ', 1, 3],
        [:T_TEXT, 'foo', 1, 4],
        [:T_DASH, '-', 1, 7],
        [:T_TEXT, 'bar', 1, 8],
        [:T_EQUALS, '=', 1, 11],
        [:T_DQUOTE, '"', 1, 12],
        [:T_TEXT, 'baz', 1, 13],
        [:T_DQUOTE, '"', 1, 16],
        [:T_GREATER, '>', 1, 17]
      ]

      lex('<p foo-bar="baz">').should == expected
    end
  end

  context 'tags with namespaces' do
    # The namespace prefix and the tag name are separated by a T_COLON token.
    example 'lex a tag with a dummy namespace' do
      expected = [
        [:T_SMALLER, '<', 1, 1],
        [:T_TEXT, 'foo', 1, 2],
        [:T_COLON, ':', 1, 5],
        [:T_TEXT, 'p', 1, 6],
        [:T_GREATER, '>', 1, 7],
        [:T_SMALLER, '<', 1, 8],
        [:T_SLASH, '/', 1, 9],
        [:T_TEXT, 'p', 1, 10],
        [:T_GREATER, '>', 1, 11]
      ]

      lex('<foo:p></p>').should == expected
    end
  end
end