diff --git a/lib/oga/lexer.rl b/lib/oga/lexer.rl index 73a8411..899776e 100644 --- a/lib/oga/lexer.rl +++ b/lib/oga/lexer.rl @@ -79,6 +79,8 @@ module Oga end def emit_text_buffer + return if @text_buffer.empty? + add_token(:T_TEXT, @text_buffer) @text_buffer = '' @@ -98,12 +100,8 @@ module Oga newline = '\n' | '\r\n'; whitespace = [ \t]; - action emit_space { - t(:T_SPACE) - } - action emit_newline { - t(:T_NEWLINE) + t(:T_TEXT) advance_line } @@ -228,9 +226,66 @@ module Oga any => buffer_text; *|; + # Elements + # + # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements + # + element_name = [a-zA-Z0-9\-_]+; + element_start = '<' element_name; + + # First emit the token, then advance the column. This way the column + # number points to the < and not the "p" in
. + action open_element { + t(:T_ELEM_OPEN, p) + + advance_column + + fcall element; + } + + element_text := |* + ^'<' => buffer_text; + + '<' => { + emit_text_buffer + fhold; + fret; + }; + *|; + + element := |* + whitespace => { advance_column }; + + element_start => open_element; + + # Consume the text inside the element. + '>' => { + advance_column + fcall element_text; + }; + + # Attributes and their values. + element_name + %{ + t(:T_ATTR, @ts, p) + } + '=' (dquote @string_dquote | squote @string_squote); + + # Non self-closing tags. + '' element_name { + emit_text_buffer + t(:T_ELEM_CLOSE, p) + + # Advance by two to take the closing into account. This is done + # after emitting tokens to ensure that they point to the start of + # the tag. + advance_column(2) + fret; + }; + *|; + main := |* - whitespace => emit_space; - newline => emit_newline; + newline => emit_newline; doctype_start => { t(:T_DOCTYPE_START) @@ -247,19 +302,10 @@ module Oga fcall comment; }; - # General rules and actions. - '<' => { t(:T_SMALLER) }; - '>' => { t(:T_GREATER) }; - '/' => { t(:T_SLASH) }; - '-' => { t(:T_DASH) }; - ']' => { t(:T_RBRACKET) }; - '[' => { t(:T_LBRACKET) }; - ':' => { t(:T_COLON) }; - '!' => { t(:T_BANG) }; - '=' => { t(:T_EQUALS) }; + element_start => open_element; - dquote => { t(:T_DQUOTE) }; - squote => { t(:T_SQUOTE) }; + #dquote => { t(:T_DQUOTE) }; + #squote => { t(:T_SQUOTE) }; *|; }%% end # Lexer diff --git a/spec/oga/lexer/elements_spec.rb b/spec/oga/lexer/elements_spec.rb new file mode 100644 index 0000000..5fa5503 --- /dev/null +++ b/spec/oga/lexer/elements_spec.rb @@ -0,0 +1,47 @@ +require 'spec_helper' + +describe Oga::Lexer do + context 'elements' do + example 'lex an opening element' do + lex('
').should == [ + [:T_ELEM_OPEN, 'p', 1, 1] + ] + end + + example 'lex an opening an closing element' do + lex('
').should == [ + [:T_ELEM_OPEN, 'p', 1, 1], + [:T_ELEM_CLOSE, 'p', 1, 4] + ] + end + + example 'lex a paragraph element with text inside it' do + lex('Hello
').should == [ + [:T_ELEM_OPEN, 'p', 1, 1], + [:T_TEXT, 'Hello', 1, 4], + [:T_ELEM_CLOSE, 'p', 1, 9] + ] + end + + example 'lex a paragraph element with attributes' do + lex('Hello
').should == [ + [:T_ELEM_OPEN, 'p', 1, 1], + [:T_ATTR, 'class', 1, 4], + [:T_STRING, 'foo', 1, 10], + [:T_TEXT, 'Hello', 1, 15], + [:T_ELEM_CLOSE, 'p', 1, 20] + ] + end + end + + context 'nested elements' do + example 'lex a nested element' do + lex('').should == [ + [:T_ELEM_OPEN, 'p', 1, 1], + [:T_ELEM_OPEN, 'a', 1, 4], + [:T_ELEM_CLOSE, 'a', 1, 7], + [:T_ELEM_CLOSE, 'p', 1, 11] + ] + end + end +end diff --git a/spec/oga/lexer/general_spec.rb b/spec/oga/lexer/general_spec.rb index 035bf9b..f65ca01 100644 --- a/spec/oga/lexer/general_spec.rb +++ b/spec/oga/lexer/general_spec.rb @@ -9,24 +9,17 @@ describe Oga::Lexer do context 'whitespace' do example 'lex regular whitespace' do - lex(' ').should == [[:T_SPACE, ' ', 1, 1]] + lex(' ').should == [[:T_TEXT, ' ', 1, 1]] end example 'lex a newline' do - lex("\n").should == [[:T_NEWLINE, "\n", 1, 1]] - end - - example 'advance column numbers for spaces' do - lex(' ').should == [ - [:T_SPACE, ' ', 1, 1], - [:T_SPACE, ' ', 1, 2] - ] + lex("\n").should == [[:T_TEXT, "\n", 1, 1]] end example 'advance line numbers for newlines' do lex("\n ").should == [ - [:T_NEWLINE, "\n", 1, 1], - [:T_SPACE, ' ', 2, 1] + [:T_TEXT, "\n", 1, 1], + [:T_TEXT, ' ', 2, 1] ] end end diff --git a/spec/oga/lexer/tags_spec.rb b/spec/oga/lexer/tags_spec.rb deleted file mode 100644 index 51453fe..0000000 --- a/spec/oga/lexer/tags_spec.rb +++ /dev/null @@ -1,72 +0,0 @@ -require 'spec_helper' - -describe Oga::Lexer do - context 'tags' do - example 'lex an opening tag' do - lex('').should == [ - [:T_SMALLER, '<', 1, 1], - [:T_TEXT, 'p', 1, 2], - [:T_GREATER, '>', 1, 3] - ] - end - - example 'lex an opening tag with an attribute' do - lex('
').should == [ - [:T_SMALLER, '<', 1, 1], - [:T_TEXT, 'p', 1, 2], - [:T_SPACE, ' ', 1, 3], - [:T_TEXT, 'title', 1, 4], - [:T_EQUALS, '=', 1, 9], - [:T_DQUOTE, '"', 1, 10], - [:T_TEXT, 'Foo', 1, 11], - [:T_DQUOTE, '"', 1, 14], - [:T_GREATER, '>', 1, 15] - ] - end - - example 'lex a tag with text inside it' do - lex('
Foo
').should == [ - [:T_SMALLER, '<', 1, 1], - [:T_TEXT, 'p', 1, 2], - [:T_GREATER, '>', 1, 3], - [:T_TEXT, 'Foo', 1, 4], - [:T_SMALLER, '<', 1, 7], - [:T_SLASH, '/', 1, 8], - [:T_TEXT, 'p', 1, 9], - [:T_GREATER, '>', 1, 10] - ] - end - - example 'lex a tag with an attribute with a dash in it' do - lex('').should == [
- [:T_SMALLER, '<', 1, 1],
- [:T_TEXT, 'p', 1, 2],
- [:T_SPACE, ' ', 1, 3],
- [:T_TEXT, 'foo', 1, 4],
- [:T_DASH, '-', 1, 7],
- [:T_TEXT, 'bar', 1, 8],
- [:T_EQUALS, '=', 1, 11],
- [:T_DQUOTE, '"', 1, 12],
- [:T_TEXT, 'baz', 1, 13],
- [:T_DQUOTE, '"', 1, 16],
- [:T_GREATER, '>', 1, 17]
- ]
- end
- end
-
- context 'tags with namespaces' do
- example 'lex a tag with a dummy namespace' do
- lex('