From 74bc11a2394834fafc753c3e62bb056b89dcfd42 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Thu, 20 Mar 2014 19:44:28 +0100 Subject: [PATCH] Rip out column counting. This makes both the lexer and parser quite a bit easier to use. Counting column numbers isn't also really needed when parsing XML/HTML. --- lib/oga/ast/node.rb | 2 +- lib/oga/lexer.rl | 35 ++------ lib/oga/parser.y | 20 ++--- spec/oga/lexer/cdata_spec.rb | 18 ++--- spec/oga/lexer/comments_spec.rb | 40 ++++----- spec/oga/lexer/doctype_spec.rb | 24 +++--- spec/oga/lexer/documents_spec.rb | 44 +++++----- spec/oga/lexer/elements_spec.rb | 98 +++++++++++------------ spec/oga/lexer/general_spec.rb | 8 +- spec/oga/lexer/html_void_elements_spec.rb | 40 ++++----- 10 files changed, 148 insertions(+), 181 deletions(-) diff --git a/lib/oga/ast/node.rb b/lib/oga/ast/node.rb index 5b49908..5f628ca 100644 --- a/lib/oga/ast/node.rb +++ b/lib/oga/ast/node.rb @@ -3,7 +3,7 @@ module Oga ## # class Node < ::AST::Node - attr_reader :line, :column + attr_reader :line end # Node end # AST end # Oga diff --git a/lib/oga/lexer.rl b/lib/oga/lexer.rl index a320aac..d041919 100644 --- a/lib/oga/lexer.rl +++ b/lib/oga/lexer.rl @@ -72,7 +72,6 @@ module Oga # def reset @line = 1 - @column = 1 @data = nil @ts = nil @te = nil @@ -124,15 +123,7 @@ module Oga # @param [Fixnum] amount The amount of lines to advance. # def advance_line(amount = 1) - @line += amount - @column = 1 - end - - ## - # @param [Fixnum] length The amount of columns to advance. - # - def advance_column(length = 1) - @column += length + @line += amount end ## @@ -166,16 +157,13 @@ module Oga end ## - # Adds a token with the given type and value to the list. If a value is - # given the column number is also advanced based on the value's length. + # Adds a token with the given type and value to the list. # # @param [Symbol] type The token type. # @param [String] value The token value. # def add_token(type, value) - token = [type, value, @line, @column] - - advance_column(value.length) if value + token = [type, value, @line] @tokens << token end @@ -214,7 +202,6 @@ module Oga # def emit_string_buffer add_token(:T_STRING, @string_buffer) - advance_column @string_buffer = '' end @@ -264,7 +251,6 @@ module Oga ^dquote => buffer_string; dquote => { emit_string_buffer - advance_column fret; }; *|; @@ -274,7 +260,6 @@ module Oga ^squote => buffer_string; squote => { emit_string_buffer - advance_column fret; }; *|; @@ -308,7 +293,7 @@ module Oga # Whitespace inside doctypes is ignored since there's no point in # including it. - whitespace => { advance_column }; + whitespace; '>' => { t(:T_DOCTYPE_END) @@ -389,7 +374,6 @@ module Oga action start_element { emit_text_buffer add_token(:T_ELEM_OPEN, nil) - advance_column # Add the element name. If the name includes a namespace we'll break # the name up into two separate tokens. @@ -399,10 +383,6 @@ module Oga ns, name = name.split(':') add_token(:T_ELEM_NS, ns) - - # Advance the column for the colon (:) that separates the namespace - # and element name. - advance_column end @elements << name @@ -422,7 +402,7 @@ module Oga # For example, in `

` the element head is ` foo="bar"`. # element_head := |* - (whitespace | '=') => { advance_column }; + whitespace | '='; # Attribute names. element_name => { t(:T_ATTR) }; @@ -452,8 +432,6 @@ module Oga add_token(:T_ELEM_CLOSE, nil) @elements.pop end - - advance_column }; # Regular closing tags. @@ -461,14 +439,11 @@ module Oga emit_text_buffer add_token(:T_ELEM_CLOSE, nil) - advance_column(@te - @ts) - @elements.pop }; # Self closing elements that are not handled by the HTML mode. '/>' => { - advance_column add_token(:T_ELEM_CLOSE, nil) @elements.pop diff --git a/lib/oga/parser.y b/lib/oga/parser.y index 0be34ae..7f04ee6 100644 --- a/lib/oga/parser.y +++ b/lib/oga/parser.y @@ -124,25 +124,22 @@ end end def reset - @lines = [] - @line = 1 - @column = 1 + @lines = [] + @line = 1 end def s(type, *children) return AST::Node.new( type, children.flatten, - :line => @line, - :column => @column + :line => @line ) end def next_token - type, value, line, column = @tokens.shift + type, value, line = @tokens.shift - @line = line if line - @column = column if column + @line = line if line return type ? [type, value] : [false, false] end @@ -150,18 +147,13 @@ end def on_error(type, value, stack) name = token_to_str(type) line_str = @lines[@line - 1] - indicator = '~' * (@column - 1) + '^' raise Racc::ParseError, <<-EOF.strip -Failed to parse the supplied input. - -Reason: unexpected #{name} with value #{value.inspect} -Location: line #{@line}, column #{@column} +Unexpected #{name} with value #{value.inspect} on line #{@line} Offending code: #{line_str} -#{indicator} Current stack: diff --git a/spec/oga/lexer/cdata_spec.rb b/spec/oga/lexer/cdata_spec.rb index d70cef1..bf41f24 100644 --- a/spec/oga/lexer/cdata_spec.rb +++ b/spec/oga/lexer/cdata_spec.rb @@ -4,25 +4,25 @@ describe Oga::Lexer do context 'cdata tags' do example 'lex a cdata tag' do lex('').should == [ - [:T_CDATA_START, '', 1, 13] + [:T_CDATA_START, '', 1] ] end example 'lex tags inside CDATA tags as regular text' do lex('Foo

]]>').should == [ - [:T_CDATA_START, 'Foo

', 1, 10], - [:T_CDATA_END, ']]>', 1, 20] + [:T_CDATA_START, 'Foo

', 1], + [:T_CDATA_END, ']]>', 1] ] end example 'lex double brackets inside a CDATA tag' do lex('').should == [ - [:T_CDATA_START, '', 1, 12] + [:T_CDATA_START, '', 1] ] end end diff --git a/spec/oga/lexer/comments_spec.rb b/spec/oga/lexer/comments_spec.rb index 254bb22..63edafc 100644 --- a/spec/oga/lexer/comments_spec.rb +++ b/spec/oga/lexer/comments_spec.rb @@ -4,51 +4,51 @@ describe Oga::Lexer do context 'comments' do example 'lex a comment' do lex('').should == [ - [:T_COMMENT_START, '', 1, 10] + [:T_COMMENT_START, '', 1] ] end example 'lex a comment containing --' do lex('').should == [ - [:T_COMMENT_START, '', 1, 9] + [:T_COMMENT_START, '', 1] ] end example 'lex a comment containing ->' do lex('').should == [ - [:T_COMMENT_START, '', 1, 9] + [:T_COMMENT_START, '', 1] ] end example 'lex a comment followed by text' do lex('foo').should == [ - [:T_COMMENT_START, '', 1, 5], - [:T_TEXT, 'foo', 1, 8] + [:T_COMMENT_START, '', 1], + [:T_TEXT, 'foo', 1] ] end example 'lex text followed by a comment' do lex('foo').should == [ - [:T_TEXT, 'foo', 1, 1], - [:T_COMMENT_START, '', 1, 8] + [:T_TEXT, 'foo', 1], + [:T_COMMENT_START, '', 1] ] end example 'lex an element followed by a comment' do lex('

').should == [ - [:T_ELEM_OPEN, nil, 1, 1], - [:T_ELEM_NAME, 'p', 1, 2], - [:T_ELEM_CLOSE, nil, 1, 4], - [:T_COMMENT_START, '', 1, 12] + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'p', 1], + [:T_ELEM_CLOSE, nil, 1], + [:T_COMMENT_START, '', 1] ] end end diff --git a/spec/oga/lexer/doctype_spec.rb b/spec/oga/lexer/doctype_spec.rb index dad3ea6..d96e59d 100644 --- a/spec/oga/lexer/doctype_spec.rb +++ b/spec/oga/lexer/doctype_spec.rb @@ -4,28 +4,28 @@ describe Oga::Lexer do context 'doctypes' do example 'lex the HTML5 doctype' do lex('').should == [ - [:T_DOCTYPE_START, '', 1, 15] + [:T_DOCTYPE_START, '', 1] ] end example 'lex a doctype with a public and system ID' do lex('').should == [ - [:T_DOCTYPE_START, '', 1, 37] + [:T_DOCTYPE_START, '', 1] ] end example 'lex a doctype with a public and system ID using single quotes' do lex("").should == [ - [:T_DOCTYPE_START, '', 1, 37] + [:T_DOCTYPE_START, '', 1] ] end end diff --git a/spec/oga/lexer/documents_spec.rb b/spec/oga/lexer/documents_spec.rb index 629a53c..3564b63 100644 --- a/spec/oga/lexer/documents_spec.rb +++ b/spec/oga/lexer/documents_spec.rb @@ -14,40 +14,40 @@ describe Oga::Lexer do EOF lex(html).should == [ - [:T_DOCTYPE_START, '', 1, 15], - [:T_TEXT, "\n", 1, 16], + [:T_DOCTYPE_START, '', 1], + [:T_TEXT, "\n", 1], # - [:T_ELEM_OPEN, nil, 2, 1], - [:T_ELEM_NAME, 'html', 2, 2], - [:T_TEXT, "\n", 2, 7], + [:T_ELEM_OPEN, nil, 2], + [:T_ELEM_NAME, 'html', 2], + [:T_TEXT, "\n", 2], # - [:T_ELEM_OPEN, nil, 3, 1], - [:T_ELEM_NAME, 'head', 3, 2], - [:T_TEXT, "\n", 3, 7], + [:T_ELEM_OPEN, nil, 3], + [:T_ELEM_NAME, 'head', 3], + [:T_TEXT, "\n", 3], # Title - [:T_ELEM_OPEN, nil, 4, 1], - [:T_ELEM_NAME, 'title', 4, 2], - [:T_TEXT, 'Title', 4, 8], - [:T_ELEM_CLOSE, nil, 4, 13], - [:T_TEXT, "\n", 4, 21], + [:T_ELEM_OPEN, nil, 4], + [:T_ELEM_NAME, 'title', 4], + [:T_TEXT, 'Title', 4], + [:T_ELEM_CLOSE, nil, 4], + [:T_TEXT, "\n", 4], # - [:T_ELEM_CLOSE, nil, 5, 1], - [:T_TEXT, "\n", 5, 8], + [:T_ELEM_CLOSE, nil, 5], + [:T_TEXT, "\n", 5], # - [:T_ELEM_OPEN, nil, 6, 1], - [:T_ELEM_NAME, 'body', 6, 2], - [:T_ELEM_CLOSE, nil, 6, 7], - [:T_TEXT, "\n", 6, 14], + [:T_ELEM_OPEN, nil, 6], + [:T_ELEM_NAME, 'body', 6], + [:T_ELEM_CLOSE, nil, 6], + [:T_TEXT, "\n", 6], # - [:T_ELEM_CLOSE, nil, 7, 1], - [:T_TEXT, "\n", 7, 8] + [:T_ELEM_CLOSE, nil, 7], + [:T_TEXT, "\n", 7] ] end end diff --git a/spec/oga/lexer/elements_spec.rb b/spec/oga/lexer/elements_spec.rb index 893a86b..6022c61 100644 --- a/spec/oga/lexer/elements_spec.rb +++ b/spec/oga/lexer/elements_spec.rb @@ -4,33 +4,33 @@ describe Oga::Lexer do context 'elements' do example 'lex an opening element' do lex('

').should == [ - [:T_ELEM_OPEN, nil, 1, 1], - [:T_ELEM_NAME, 'p', 1, 2] + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'p', 1] ] end example 'lex an opening an closing element' do lex('

').should == [ - [:T_ELEM_OPEN, nil, 1, 1], - [:T_ELEM_NAME, 'p', 1, 2], - [:T_ELEM_CLOSE, nil, 1, 4] + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'p', 1], + [:T_ELEM_CLOSE, nil, 1] ] end example 'lex a paragraph element with text inside it' do lex('

Hello

').should == [ - [:T_ELEM_OPEN, nil, 1, 1], - [:T_ELEM_NAME, 'p', 1, 2], - [:T_TEXT, 'Hello', 1, 4], - [:T_ELEM_CLOSE, nil, 1, 9] + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'p', 1], + [:T_TEXT, 'Hello', 1], + [:T_ELEM_CLOSE, nil, 1] ] end example 'lex text followed by a paragraph element' do lex('Foo

').should == [ - [:T_TEXT, 'Foo', 1, 1], - [:T_ELEM_OPEN, nil, 1, 4], - [:T_ELEM_NAME, 'p', 1, 5] + [:T_TEXT, 'Foo', 1], + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'p', 1] ] end end @@ -38,21 +38,21 @@ describe Oga::Lexer do context 'elements with attributes' do example 'lex an element with an attribute without a value' do lex('

').should == [ - [:T_ELEM_OPEN, nil, 1, 1], - [:T_ELEM_NAME, 'p', 1, 2], - [:T_ATTR, 'foo', 1, 4], - [:T_ELEM_CLOSE, nil, 1, 8] + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'p', 1], + [:T_ATTR, 'foo', 1], + [:T_ELEM_CLOSE, nil, 1] ] end example 'lex a paragraph element with attributes' do lex('

Hello

').should == [ - [:T_ELEM_OPEN, nil, 1, 1], - [:T_ELEM_NAME, 'p', 1, 2], - [:T_ATTR, 'class', 1, 4], - [:T_STRING, 'foo', 1, 10], - [:T_TEXT, 'Hello', 1, 16], - [:T_ELEM_CLOSE, nil, 1, 21] + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'p', 1], + [:T_ATTR, 'class', 1], + [:T_STRING, 'foo', 1], + [:T_TEXT, 'Hello', 1], + [:T_ELEM_CLOSE, nil, 1] ] end end @@ -60,26 +60,26 @@ describe Oga::Lexer do context 'nested elements' do example 'lex a nested element' do lex('

').should == [ - [:T_ELEM_OPEN, nil, 1, 1], - [:T_ELEM_NAME, 'p', 1, 2], - [:T_ELEM_OPEN, nil, 1, 4], - [:T_ELEM_NAME, 'a', 1, 5], - [:T_ELEM_CLOSE, nil, 1, 7], - [:T_ELEM_CLOSE, nil, 1, 11] + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'p', 1], + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'a', 1], + [:T_ELEM_CLOSE, nil, 1], + [:T_ELEM_CLOSE, nil, 1] ] end example 'lex nested elements and text nodes' do lex('

Foobarbaz

').should == [ - [:T_ELEM_OPEN, nil, 1, 1], - [:T_ELEM_NAME, 'p', 1, 2], - [:T_TEXT, 'Foo', 1, 4], - [:T_ELEM_OPEN, nil, 1, 7], - [:T_ELEM_NAME, 'a', 1, 8], - [:T_TEXT, 'bar', 1, 10], - [:T_ELEM_CLOSE, nil, 1, 13], - [:T_TEXT, 'baz', 1, 17], - [:T_ELEM_CLOSE, nil, 1, 20] + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'p', 1], + [:T_TEXT, 'Foo', 1], + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'a', 1], + [:T_TEXT, 'bar', 1], + [:T_ELEM_CLOSE, nil, 1], + [:T_TEXT, 'baz', 1], + [:T_ELEM_CLOSE, nil, 1] ] end end @@ -87,19 +87,19 @@ describe Oga::Lexer do context 'void elements' do example 'lex a void element' do lex('
').should == [ - [:T_ELEM_OPEN, nil, 1, 1], - [:T_ELEM_NAME, 'br', 1, 2], - [:T_ELEM_CLOSE, nil, 1, 6] + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'br', 1], + [:T_ELEM_CLOSE, nil, 1] ] end example 'lex a void element with an attribute' do lex('
').should == [ - [:T_ELEM_OPEN, nil, 1, 1], - [:T_ELEM_NAME, 'br', 1, 2], - [:T_ATTR, 'class', 1, 5], - [:T_STRING, 'foo', 1, 11], - [:T_ELEM_CLOSE, nil, 1, 18] + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'br', 1], + [:T_ATTR, 'class', 1], + [:T_STRING, 'foo', 1], + [:T_ELEM_CLOSE, nil, 1] ] end end @@ -107,10 +107,10 @@ describe Oga::Lexer do context 'elements with namespaces' do example 'lex an element with namespaces' do lex('

').should == [ - [:T_ELEM_OPEN, nil, 1, 1], - [:T_ELEM_NS, 'foo', 1, 2], - [:T_ELEM_NAME, 'p', 1, 6], - [:T_ELEM_CLOSE, nil, 1, 8] + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NS, 'foo', 1], + [:T_ELEM_NAME, 'p', 1], + [:T_ELEM_CLOSE, nil, 1] ] end end diff --git a/spec/oga/lexer/general_spec.rb b/spec/oga/lexer/general_spec.rb index a474058..0c8cf87 100644 --- a/spec/oga/lexer/general_spec.rb +++ b/spec/oga/lexer/general_spec.rb @@ -3,20 +3,20 @@ require 'spec_helper' describe Oga::Lexer do context 'regular text' do example 'lex regular text' do - lex('hello').should == [[:T_TEXT, 'hello', 1, 1]] + lex('hello').should == [[:T_TEXT, 'hello', 1]] end example 'lex regular whitespace' do - lex(' ').should == [[:T_TEXT, ' ', 1, 1]] + lex(' ').should == [[:T_TEXT, ' ', 1]] end example 'lex a newline' do - lex("\n").should == [[:T_TEXT, "\n", 1, 1]] + lex("\n").should == [[:T_TEXT, "\n", 1]] end example 'lex text followed by a newline' do lex("foo\n").should == [ - [:T_TEXT, "foo\n", 1, 1] + [:T_TEXT, "foo\n", 1] ] end end diff --git a/spec/oga/lexer/html_void_elements_spec.rb b/spec/oga/lexer/html_void_elements_spec.rb index 998ce4f..58754ab 100644 --- a/spec/oga/lexer/html_void_elements_spec.rb +++ b/spec/oga/lexer/html_void_elements_spec.rb @@ -4,41 +4,41 @@ describe Oga::Lexer do context 'HTML void elements' do example 'lex a void element that omits the closing /' do lex('', :html => true).should == [ - [:T_ELEM_OPEN, nil, 1, 1], - [:T_ELEM_NAME, 'link', 1, 2], - [:T_ELEM_CLOSE, nil, 1, 6] + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'link', 1], + [:T_ELEM_CLOSE, nil, 1] ] end example 'lex text after a void element' do lex('foo', :html => true).should == [ - [:T_ELEM_OPEN, nil, 1, 1], - [:T_ELEM_NAME, 'link', 1, 2], - [:T_ELEM_CLOSE, nil, 1, 6], - [:T_TEXT, 'foo', 1, 7] + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'link', 1], + [:T_ELEM_CLOSE, nil, 1], + [:T_TEXT, 'foo', 1] ] end example 'lex a void element inside another element' do lex('', :html => true).should == [ - [:T_ELEM_OPEN, nil, 1, 1], - [:T_ELEM_NAME, 'head', 1, 2], - [:T_ELEM_OPEN, nil, 1, 7], - [:T_ELEM_NAME, 'link', 1, 8], - [:T_ELEM_CLOSE, nil, 1, 12], - [:T_ELEM_CLOSE, nil, 1, 13] + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'head', 1], + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'link', 1], + [:T_ELEM_CLOSE, nil, 1], + [:T_ELEM_CLOSE, nil, 1] ] end example 'lex a void element inside another element with whitespace' do lex("\n", :html => true).should == [ - [:T_ELEM_OPEN, nil, 1, 1], - [:T_ELEM_NAME, 'head', 1, 2], - [:T_ELEM_OPEN, nil, 1, 7], - [:T_ELEM_NAME, 'link', 1, 8], - [:T_ELEM_CLOSE, nil, 1, 12], - [:T_TEXT, "\n", 1, 13], - [:T_ELEM_CLOSE, nil, 2, 1] + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'head', 1], + [:T_ELEM_OPEN, nil, 1], + [:T_ELEM_NAME, 'link', 1], + [:T_ELEM_CLOSE, nil, 1], + [:T_TEXT, "\n", 1], + [:T_ELEM_CLOSE, nil, 2] ] end end