Reworked token generation for elements.

This emits separate tokens for the start tag (T_ELEM_OPEN) and name
(T_ELEM_NAME). This makes it easier to include the namespace of an element
(T_ELEM_NS) in the output.
This commit is contained in:
Yorick Peterse 2014-03-10 23:50:39 +01:00
parent cd53d5e426
commit eacd9b88cf
2 changed files with 65 additions and 30 deletions

View File

@ -230,19 +230,35 @@ module Oga
#
# http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
#
element_name = [a-zA-Z0-9\-_]+;
element_start = '<' element_name;
# First emit the token, then advance the column. This way the column
# number points to the < and not the "p" in <p>.
# Action that creates the tokens for the opening tag, name and namespace
# (if any). Remaining work is delegated to a dedicated machine.
action open_element {
t(:T_ELEM_OPEN, @ts + 1)
add_token(:T_ELEM_OPEN, nil)
advance_column
# Add the element name. If the name includes a namespace we'll break
# the name up into two separate tokens.
name = text(@ts + 1)
if name.include?(':')
ns, name = name.split(':')
add_token(:T_ELEM_NS, ns)
# Advance the column for the colon (:) that separates the namespace
# and element name.
advance_column
end
add_token(:T_ELEM_NAME, name)
fcall element;
}
element_name = [a-zA-Z0-9\-_:]+;
element_start = '<' element_name;
element_text := |*
^'<' => buffer_text;
@ -275,12 +291,13 @@ module Oga
# Non self-closing elements.
'</' element_name {
emit_text_buffer
t(:T_ELEM_CLOSE, p)
add_token(:T_ELEM_CLOSE, nil)
# Advance by two to take the closing </ into account. This is done
# after emitting tokens to ensure that they point to the start of
# the tag.
# Advance the column for the </
advance_column(2)
# Advance the column for the closing name.
advance_column(@te - p)
fret;
};
@ -311,9 +328,6 @@ module Oga
};
element_start => open_element;
#dquote => { t(:T_DQUOTE) };
#squote => { t(:T_SQUOTE) };
*|;
}%%
end # Lexer

View File

@ -4,32 +4,36 @@ describe Oga::Lexer do
context 'elements' do
example 'lex an opening element' do
lex('<p>').should == [
[:T_ELEM_OPEN, 'p', 1, 1]
[:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NAME, 'p', 1, 2]
]
end
example 'lex an opening an closing element' do
lex('<p></p>').should == [
[:T_ELEM_OPEN, 'p', 1, 1],
[:T_ELEM_CLOSE, 'p', 1, 4]
[:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NAME, 'p', 1, 2],
[:T_ELEM_CLOSE, nil, 1, 4]
]
end
example 'lex a paragraph element with text inside it' do
lex('<p>Hello</p>').should == [
[:T_ELEM_OPEN, 'p', 1, 1],
[:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NAME, 'p', 1, 2],
[:T_TEXT, 'Hello', 1, 4],
[:T_ELEM_CLOSE, 'p', 1, 9]
[:T_ELEM_CLOSE, nil, 1, 9]
]
end
example 'lex a paragraph element with attributes' do
lex('<p class="foo">Hello</p>').should == [
[:T_ELEM_OPEN, 'p', 1, 1],
[:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NAME, 'p', 1, 2],
[:T_ATTR, 'class', 1, 4],
[:T_STRING, 'foo', 1, 10],
[:T_TEXT, 'Hello', 1, 16],
[:T_ELEM_CLOSE, 'p', 1, 21]
[:T_ELEM_CLOSE, nil, 1, 21]
]
end
end
@ -37,22 +41,26 @@ describe Oga::Lexer do
context 'nested elements' do
example 'lex a nested element' do
lex('<p><a></a></p>').should == [
[:T_ELEM_OPEN, 'p', 1, 1],
[:T_ELEM_OPEN, 'a', 1, 4],
[:T_ELEM_CLOSE, 'a', 1, 7],
[:T_ELEM_CLOSE, 'p', 1, 11]
[:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NAME, 'p', 1, 2],
[:T_ELEM_OPEN, nil, 1, 4],
[:T_ELEM_NAME, 'a', 1, 5],
[:T_ELEM_CLOSE, nil, 1, 7],
[:T_ELEM_CLOSE, nil, 1, 11]
]
end
example 'lex nested elements and text nodes' do
lex('<p>Foo<a>bar</a>baz</p>').should == [
[:T_ELEM_OPEN, 'p', 1, 1],
[:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NAME, 'p', 1, 2],
[:T_TEXT, 'Foo', 1, 4],
[:T_ELEM_OPEN, 'a', 1, 7],
[:T_ELEM_OPEN, nil, 1, 7],
[:T_ELEM_NAME, 'a', 1, 8],
[:T_TEXT, 'bar', 1, 10],
[:T_ELEM_CLOSE, 'a', 1, 13],
[:T_ELEM_CLOSE, nil, 1, 13],
[:T_TEXT, 'baz', 1, 17],
[:T_ELEM_CLOSE, 'p', 1, 20]
[:T_ELEM_CLOSE, nil, 1, 20]
]
end
end
@ -60,18 +68,31 @@ describe Oga::Lexer do
context 'void elements' do
example 'lex a void element' do
lex('<br />').should == [
[:T_ELEM_OPEN, 'br', 1, 1],
[:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NAME, 'br', 1, 2],
[:T_ELEM_CLOSE, nil, 1, 6]
]
end
example 'lex a void element with an attribute' do
lex('<br class="foo" />').should == [
[:T_ELEM_OPEN, 'br', 1, 1],
[:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NAME, 'br', 1, 2],
[:T_ATTR, 'class', 1, 5],
[:T_STRING, 'foo', 1, 11],
[:T_ELEM_CLOSE, nil, 1, 18]
]
end
end
context 'elements with namespaces' do
example 'lex an element with namespaces' do
lex('<foo:p></p>').should == [
[:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NS, 'foo', 1, 2],
[:T_ELEM_NAME, 'p', 1, 6],
[:T_ELEM_CLOSE, nil, 1, 8]
]
end
end
end