From a5a3b8db3f6cfe08f3d11e1026999f8ff4988b2f Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Mon, 3 Mar 2014 22:08:46 +0100 Subject: [PATCH] Basic lexing of HTML tags. The current implementation is a bit messy. In particular the counting of column numbers is not entirely the way it should be. There are also some problems with nested tags/text that I still have to resolve. --- lib/oga/lexer.rl | 84 +++++++++++++++++++++++++-------- spec/oga/lexer/elements_spec.rb | 47 ++++++++++++++++++ spec/oga/lexer/general_spec.rb | 15 ++---- spec/oga/lexer/tags_spec.rb | 72 ---------------------------- 4 files changed, 116 insertions(+), 102 deletions(-) create mode 100644 spec/oga/lexer/elements_spec.rb delete mode 100644 spec/oga/lexer/tags_spec.rb diff --git a/lib/oga/lexer.rl b/lib/oga/lexer.rl index 73a8411..899776e 100644 --- a/lib/oga/lexer.rl +++ b/lib/oga/lexer.rl @@ -79,6 +79,8 @@ module Oga end def emit_text_buffer + return if @text_buffer.empty? + add_token(:T_TEXT, @text_buffer) @text_buffer = '' @@ -98,12 +100,8 @@ module Oga newline = '\n' | '\r\n'; whitespace = [ \t]; - action emit_space { - t(:T_SPACE) - } - action emit_newline { - t(:T_NEWLINE) + t(:T_TEXT) advance_line } @@ -228,9 +226,66 @@ module Oga any => buffer_text; *|; + # Elements + # + # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements + # + element_name = [a-zA-Z0-9\-_]+; + element_start = '<' element_name; + + # First emit the token, then advance the column. This way the column + # number points to the < and not the "p" in

. + action open_element { + t(:T_ELEM_OPEN, p) + + advance_column + + fcall element; + } + + element_text := |* + ^'<' => buffer_text; + + '<' => { + emit_text_buffer + fhold; + fret; + }; + *|; + + element := |* + whitespace => { advance_column }; + + element_start => open_element; + + # Consume the text inside the element. + '>' => { + advance_column + fcall element_text; + }; + + # Attributes and their values. + element_name + %{ + t(:T_ATTR, @ts, p) + } + '=' (dquote @string_dquote | squote @string_squote); + + # Non self-closing tags. + ' emit_space; - newline => emit_newline; + newline => emit_newline; doctype_start => { t(:T_DOCTYPE_START) @@ -247,19 +302,10 @@ module Oga fcall comment; }; - # General rules and actions. - '<' => { t(:T_SMALLER) }; - '>' => { t(:T_GREATER) }; - '/' => { t(:T_SLASH) }; - '-' => { t(:T_DASH) }; - ']' => { t(:T_RBRACKET) }; - '[' => { t(:T_LBRACKET) }; - ':' => { t(:T_COLON) }; - '!' => { t(:T_BANG) }; - '=' => { t(:T_EQUALS) }; + element_start => open_element; - dquote => { t(:T_DQUOTE) }; - squote => { t(:T_SQUOTE) }; + #dquote => { t(:T_DQUOTE) }; + #squote => { t(:T_SQUOTE) }; *|; }%% end # Lexer diff --git a/spec/oga/lexer/elements_spec.rb b/spec/oga/lexer/elements_spec.rb new file mode 100644 index 0000000..5fa5503 --- /dev/null +++ b/spec/oga/lexer/elements_spec.rb @@ -0,0 +1,47 @@ +require 'spec_helper' + +describe Oga::Lexer do + context 'elements' do + example 'lex an opening element' do + lex('

').should == [ + [:T_ELEM_OPEN, 'p', 1, 1] + ] + end + + example 'lex an opening an closing element' do + lex('

').should == [ + [:T_ELEM_OPEN, 'p', 1, 1], + [:T_ELEM_CLOSE, 'p', 1, 4] + ] + end + + example 'lex a paragraph element with text inside it' do + lex('

Hello

').should == [ + [:T_ELEM_OPEN, 'p', 1, 1], + [:T_TEXT, 'Hello', 1, 4], + [:T_ELEM_CLOSE, 'p', 1, 9] + ] + end + + example 'lex a paragraph element with attributes' do + lex('

Hello

').should == [ + [:T_ELEM_OPEN, 'p', 1, 1], + [:T_ATTR, 'class', 1, 4], + [:T_STRING, 'foo', 1, 10], + [:T_TEXT, 'Hello', 1, 15], + [:T_ELEM_CLOSE, 'p', 1, 20] + ] + end + end + + context 'nested elements' do + example 'lex a nested element' do + lex('

').should == [ + [:T_ELEM_OPEN, 'p', 1, 1], + [:T_ELEM_OPEN, 'a', 1, 4], + [:T_ELEM_CLOSE, 'a', 1, 7], + [:T_ELEM_CLOSE, 'p', 1, 11] + ] + end + end +end diff --git a/spec/oga/lexer/general_spec.rb b/spec/oga/lexer/general_spec.rb index 035bf9b..f65ca01 100644 --- a/spec/oga/lexer/general_spec.rb +++ b/spec/oga/lexer/general_spec.rb @@ -9,24 +9,17 @@ describe Oga::Lexer do context 'whitespace' do example 'lex regular whitespace' do - lex(' ').should == [[:T_SPACE, ' ', 1, 1]] + lex(' ').should == [[:T_TEXT, ' ', 1, 1]] end example 'lex a newline' do - lex("\n").should == [[:T_NEWLINE, "\n", 1, 1]] - end - - example 'advance column numbers for spaces' do - lex(' ').should == [ - [:T_SPACE, ' ', 1, 1], - [:T_SPACE, ' ', 1, 2] - ] + lex("\n").should == [[:T_TEXT, "\n", 1, 1]] end example 'advance line numbers for newlines' do lex("\n ").should == [ - [:T_NEWLINE, "\n", 1, 1], - [:T_SPACE, ' ', 2, 1] + [:T_TEXT, "\n", 1, 1], + [:T_TEXT, ' ', 2, 1] ] end end diff --git a/spec/oga/lexer/tags_spec.rb b/spec/oga/lexer/tags_spec.rb deleted file mode 100644 index 51453fe..0000000 --- a/spec/oga/lexer/tags_spec.rb +++ /dev/null @@ -1,72 +0,0 @@ -require 'spec_helper' - -describe Oga::Lexer do - context 'tags' do - example 'lex an opening tag' do - lex('

').should == [ - [:T_SMALLER, '<', 1, 1], - [:T_TEXT, 'p', 1, 2], - [:T_GREATER, '>', 1, 3] - ] - end - - example 'lex an opening tag with an attribute' do - lex('

').should == [ - [:T_SMALLER, '<', 1, 1], - [:T_TEXT, 'p', 1, 2], - [:T_SPACE, ' ', 1, 3], - [:T_TEXT, 'title', 1, 4], - [:T_EQUALS, '=', 1, 9], - [:T_DQUOTE, '"', 1, 10], - [:T_TEXT, 'Foo', 1, 11], - [:T_DQUOTE, '"', 1, 14], - [:T_GREATER, '>', 1, 15] - ] - end - - example 'lex a tag with text inside it' do - lex('

Foo

').should == [ - [:T_SMALLER, '<', 1, 1], - [:T_TEXT, 'p', 1, 2], - [:T_GREATER, '>', 1, 3], - [:T_TEXT, 'Foo', 1, 4], - [:T_SMALLER, '<', 1, 7], - [:T_SLASH, '/', 1, 8], - [:T_TEXT, 'p', 1, 9], - [:T_GREATER, '>', 1, 10] - ] - end - - example 'lex a tag with an attribute with a dash in it' do - lex('

').should == [ - [:T_SMALLER, '<', 1, 1], - [:T_TEXT, 'p', 1, 2], - [:T_SPACE, ' ', 1, 3], - [:T_TEXT, 'foo', 1, 4], - [:T_DASH, '-', 1, 7], - [:T_TEXT, 'bar', 1, 8], - [:T_EQUALS, '=', 1, 11], - [:T_DQUOTE, '"', 1, 12], - [:T_TEXT, 'baz', 1, 13], - [:T_DQUOTE, '"', 1, 16], - [:T_GREATER, '>', 1, 17] - ] - end - end - - context 'tags with namespaces' do - example 'lex a tag with a dummy namespace' do - lex('

').should == [ - [:T_SMALLER, '<', 1, 1], - [:T_TEXT, 'foo', 1, 2], - [:T_COLON, ':', 1, 5], - [:T_TEXT, 'p', 1, 6], - [:T_GREATER, '>', 1, 7], - [:T_SMALLER, '<', 1, 8], - [:T_SLASH, '/', 1, 9], - [:T_TEXT, 'p', 1, 10], - [:T_GREATER, '>', 1, 11] - ] - end - end -end