Reworked token generation for elements.

This emits separate tokens for the start tag (T_ELEM_OPEN) and the element
name (T_ELEM_NAME). This makes it easier to include the namespace of an
element (T_ELEM_NS) in the output.
This commit is contained in:
Yorick Peterse 2014-03-10 23:50:39 +01:00
parent cd53d5e426
commit eacd9b88cf
2 changed files with 65 additions and 30 deletions

View File

@ -230,19 +230,35 @@ module Oga
# #
# http://www.w3.org/TR/html-markup/syntax.html#syntax-elements # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
# #
element_name = [a-zA-Z0-9\-_]+;
element_start = '<' element_name;
# First emit the token, then advance the column. This way the column # Action that creates the tokens for the opening tag, name and namespace
# number points to the < and not the "p" in <p>. # (if any). Remaining work is delegated to a dedicated machine.
action open_element { action open_element {
t(:T_ELEM_OPEN, @ts + 1) add_token(:T_ELEM_OPEN, nil)
advance_column advance_column
# Add the element name. If the name includes a namespace we'll break
# the name up into two separate tokens.
name = text(@ts + 1)
if name.include?(':')
ns, name = name.split(':')
add_token(:T_ELEM_NS, ns)
# Advance the column for the colon (:) that separates the namespace
# and element name.
advance_column
end
add_token(:T_ELEM_NAME, name)
fcall element; fcall element;
} }
element_name = [a-zA-Z0-9\-_:]+;
element_start = '<' element_name;
element_text := |* element_text := |*
^'<' => buffer_text; ^'<' => buffer_text;
@ -275,12 +291,13 @@ module Oga
# Non self-closing elements. # Non self-closing elements.
'</' element_name { '</' element_name {
emit_text_buffer emit_text_buffer
t(:T_ELEM_CLOSE, p) add_token(:T_ELEM_CLOSE, nil)
# Advance by two to take the closing </ into account. This is done # Advance the column for the </
# after emitting tokens to ensure that they point to the start of
# the tag.
advance_column(2) advance_column(2)
# Advance the column for the closing name.
advance_column(@te - p)
fret; fret;
}; };
@ -311,9 +328,6 @@ module Oga
}; };
element_start => open_element; element_start => open_element;
#dquote => { t(:T_DQUOTE) };
#squote => { t(:T_SQUOTE) };
*|; *|;
}%% }%%
end # Lexer end # Lexer

View File

@ -4,32 +4,36 @@ describe Oga::Lexer do
context 'elements' do context 'elements' do
example 'lex an opening element' do example 'lex an opening element' do
lex('<p>').should == [ lex('<p>').should == [
[:T_ELEM_OPEN, 'p', 1, 1] [:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NAME, 'p', 1, 2]
] ]
end end
example 'lex an opening an closing element' do example 'lex an opening an closing element' do
lex('<p></p>').should == [ lex('<p></p>').should == [
[:T_ELEM_OPEN, 'p', 1, 1], [:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_CLOSE, 'p', 1, 4] [:T_ELEM_NAME, 'p', 1, 2],
[:T_ELEM_CLOSE, nil, 1, 4]
] ]
end end
example 'lex a paragraph element with text inside it' do example 'lex a paragraph element with text inside it' do
lex('<p>Hello</p>').should == [ lex('<p>Hello</p>').should == [
[:T_ELEM_OPEN, 'p', 1, 1], [:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NAME, 'p', 1, 2],
[:T_TEXT, 'Hello', 1, 4], [:T_TEXT, 'Hello', 1, 4],
[:T_ELEM_CLOSE, 'p', 1, 9] [:T_ELEM_CLOSE, nil, 1, 9]
] ]
end end
example 'lex a paragraph element with attributes' do example 'lex a paragraph element with attributes' do
lex('<p class="foo">Hello</p>').should == [ lex('<p class="foo">Hello</p>').should == [
[:T_ELEM_OPEN, 'p', 1, 1], [:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NAME, 'p', 1, 2],
[:T_ATTR, 'class', 1, 4], [:T_ATTR, 'class', 1, 4],
[:T_STRING, 'foo', 1, 10], [:T_STRING, 'foo', 1, 10],
[:T_TEXT, 'Hello', 1, 16], [:T_TEXT, 'Hello', 1, 16],
[:T_ELEM_CLOSE, 'p', 1, 21] [:T_ELEM_CLOSE, nil, 1, 21]
] ]
end end
end end
@ -37,22 +41,26 @@ describe Oga::Lexer do
context 'nested elements' do context 'nested elements' do
example 'lex a nested element' do example 'lex a nested element' do
lex('<p><a></a></p>').should == [ lex('<p><a></a></p>').should == [
[:T_ELEM_OPEN, 'p', 1, 1], [:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_OPEN, 'a', 1, 4], [:T_ELEM_NAME, 'p', 1, 2],
[:T_ELEM_CLOSE, 'a', 1, 7], [:T_ELEM_OPEN, nil, 1, 4],
[:T_ELEM_CLOSE, 'p', 1, 11] [:T_ELEM_NAME, 'a', 1, 5],
[:T_ELEM_CLOSE, nil, 1, 7],
[:T_ELEM_CLOSE, nil, 1, 11]
] ]
end end
example 'lex nested elements and text nodes' do example 'lex nested elements and text nodes' do
lex('<p>Foo<a>bar</a>baz</p>').should == [ lex('<p>Foo<a>bar</a>baz</p>').should == [
[:T_ELEM_OPEN, 'p', 1, 1], [:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NAME, 'p', 1, 2],
[:T_TEXT, 'Foo', 1, 4], [:T_TEXT, 'Foo', 1, 4],
[:T_ELEM_OPEN, 'a', 1, 7], [:T_ELEM_OPEN, nil, 1, 7],
[:T_ELEM_NAME, 'a', 1, 8],
[:T_TEXT, 'bar', 1, 10], [:T_TEXT, 'bar', 1, 10],
[:T_ELEM_CLOSE, 'a', 1, 13], [:T_ELEM_CLOSE, nil, 1, 13],
[:T_TEXT, 'baz', 1, 17], [:T_TEXT, 'baz', 1, 17],
[:T_ELEM_CLOSE, 'p', 1, 20] [:T_ELEM_CLOSE, nil, 1, 20]
] ]
end end
end end
@ -60,18 +68,31 @@ describe Oga::Lexer do
context 'void elements' do context 'void elements' do
example 'lex a void element' do example 'lex a void element' do
lex('<br />').should == [ lex('<br />').should == [
[:T_ELEM_OPEN, 'br', 1, 1], [:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NAME, 'br', 1, 2],
[:T_ELEM_CLOSE, nil, 1, 6] [:T_ELEM_CLOSE, nil, 1, 6]
] ]
end end
example 'lex a void element with an attribute' do example 'lex a void element with an attribute' do
lex('<br class="foo" />').should == [ lex('<br class="foo" />').should == [
[:T_ELEM_OPEN, 'br', 1, 1], [:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NAME, 'br', 1, 2],
[:T_ATTR, 'class', 1, 5], [:T_ATTR, 'class', 1, 5],
[:T_STRING, 'foo', 1, 11], [:T_STRING, 'foo', 1, 11],
[:T_ELEM_CLOSE, nil, 1, 18] [:T_ELEM_CLOSE, nil, 1, 18]
] ]
end end
end end
context 'elements with namespaces' do
example 'lex an element with namespaces' do
lex('<foo:p></p>').should == [
[:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NS, 'foo', 1, 2],
[:T_ELEM_NAME, 'p', 1, 6],
[:T_ELEM_CLOSE, nil, 1, 8]
]
end
end
end end