diff --git a/lib/oga/lexer.rl b/lib/oga/lexer.rl index a922a46..61d229d 100644 --- a/lib/oga/lexer.rl +++ b/lib/oga/lexer.rl @@ -6,6 +6,27 @@ module Oga class Lexer %% write data; # % + attr_reader :html + + HTML_VOID_ELEMENTS = [ + 'area', + 'base', + 'br', + 'col', + 'command', + 'embed', + 'hr', + 'img', + 'input', + 'keygen', + 'link', + 'meta', + 'param', + 'source', + 'track', + 'wbr' + ] + # Lazy way of forwarding instance method calls used internally by Ragel to # their corresponding class methods. private_methods.grep(/^_lexer_/).each do |name| @@ -16,19 +37,24 @@ module Oga private(name) end - def initialize + def initialize(options = {}) + options.each do |key, value| + instance_variable_set("@#{key}", value) if respond_to?(key) + end + reset end def reset - @line = 1 - @column = 1 - @data = nil - @ts = nil - @te = nil - @tokens = [] - @stack = [] - @top = 0 + @line = 1 + @column = 1 + @data = nil + @ts = nil + @te = nil + @tokens = [] + @stack = [] + @top = 0 + @elements = [] @string_buffer = '' @text_buffer = '' @@ -49,6 +75,10 @@ module Oga return tokens end + def html? + return !!html + end + private def advance_line @@ -93,6 +123,10 @@ module Oga @string_buffer = '' end + def current_element + return @elements.last + end + %%{ # Use instance variables for `ts` and friends. access @; @@ -255,6 +289,8 @@ module Oga advance_column end + @elements << name + add_token(:T_ELEM_NAME, name) fcall element; @@ -270,7 +306,11 @@ module Oga advance_line }; - ^('<' | newline) => buffer_text; + ^('<' | newline) => { + @text_buffer << text + + emit_text_buffer if @te == eof + }; '<' => { emit_text_buffer @@ -305,7 +345,15 @@ module Oga # Consume the text inside the element. '>' => { + # If HTML lexing is enabled and we're in a void element we'll bail + # out right away. + if html? and HTML_VOID_ELEMENTS.include?(current_element) + add_token(:T_ELEM_CLOSE, nil) + @elements.pop + end + advance_column + fcall element_text; }; @@ -325,6 +373,9 @@ module Oga # Non self-closing elements. ' { fcall element_closing_tag; + + @elements.pop + fret; }; @@ -332,6 +383,9 @@ module Oga '/>' => { advance_column add_token(:T_ELEM_CLOSE, nil) + + @elements.pop + fret; }; *|; diff --git a/lib/oga/parser.y b/lib/oga/parser.y index aefa1a1..6c86e5c 100644 --- a/lib/oga/parser.y +++ b/lib/oga/parser.y @@ -139,8 +139,8 @@ end ---- inner - def initialize - @lexer = Lexer.new + def initialize(options = {}) + @lexer = Lexer.new(options) end def reset diff --git a/spec/oga/lexer/html_void_elements_spec.rb b/spec/oga/lexer/html_void_elements_spec.rb new file mode 100644 index 0000000..998ce4f --- /dev/null +++ b/spec/oga/lexer/html_void_elements_spec.rb @@ -0,0 +1,45 @@ +require 'spec_helper' + +describe Oga::Lexer do + context 'HTML void elements' do + example 'lex a void element that omits the closing /' do + lex('', :html => true).should == [ + [:T_ELEM_OPEN, nil, 1, 1], + [:T_ELEM_NAME, 'link', 1, 2], + [:T_ELEM_CLOSE, nil, 1, 6] + ] + end + + example 'lex text after a void element' do + lex('foo', :html => true).should == [ + [:T_ELEM_OPEN, nil, 1, 1], + [:T_ELEM_NAME, 'link', 1, 2], + [:T_ELEM_CLOSE, nil, 1, 6], + [:T_TEXT, 'foo', 1, 7] + ] + end + + example 'lex a void element inside another element' do + lex('', :html => true).should == [ + [:T_ELEM_OPEN, nil, 1, 1], + [:T_ELEM_NAME, 'head', 1, 2], + [:T_ELEM_OPEN, nil, 1, 7], + [:T_ELEM_NAME, 'link', 1, 8], + [:T_ELEM_CLOSE, nil, 1, 12], + [:T_ELEM_CLOSE, nil, 1, 13] + ] + end + + example 'lex a void element inside another element with whitespace' do + lex("\n", :html => true).should == [ + [:T_ELEM_OPEN, nil, 1, 1], + [:T_ELEM_NAME, 'head', 1, 2], + [:T_ELEM_OPEN, nil, 1, 7], + [:T_ELEM_NAME, 'link', 1, 8], + [:T_ELEM_CLOSE, nil, 1, 12], + [:T_TEXT, "\n", 1, 13], + [:T_ELEM_CLOSE, nil, 2, 1] + ] + end + end +end diff --git a/spec/oga/parser/html_void_elements_spec.rb b/spec/oga/parser/html_void_elements_spec.rb new file mode 100644 index 0000000..666007c --- /dev/null +++ b/spec/oga/parser/html_void_elements_spec.rb @@ -0,0 +1,64 @@ +require 'spec_helper' + +describe Oga::Parser do + context 'HTML void elements' do + example 'parse a void element that omits the closing /' do + parse_html('').should == s( + :document, + s(:element, nil, 'link', nil, nil) + ) + end + + example 'parse a void element inside another element' do + parse_html('').should == s( + :document, + s(:element, nil, 'head', nil, s(:element, nil, 'link', nil, nil)) + ) + end + + example 'parse a void element with attributes inside another element' do + parse_html('').should == s( + :document, + s( + :element, + nil, + 'head', + nil, + s( + :element, + nil, + 'link', + s(:attributes, s(:attribute, 'href', 'foo.css')), + nil + ) + ) + ) + end + + example 'parse a void element and a non void element in the same parent' do + parse_html('Foo').should == s( + :document, + s( + :element, + nil, + 'head', + nil, + s( + :element, + nil, + 'link', + nil, + nil + ), + s( + :element, + nil, + 'title', + nil, + s(:text, 'Foo') + ) + ) + ) + end + end +end diff --git a/spec/support/parsing.rb b/spec/support/parsing.rb index 1488add..6a0fa5a 100644 --- a/spec/support/parsing.rb +++ b/spec/support/parsing.rb @@ -15,10 +15,11 @@ module Oga # Lexes a string and returns the tokens. # # @param [String] input + # @param [Hash] options # @return [Array] # - def lex(input) - return Oga::Lexer.new.lex(input) + def lex(input, options = {}) + return Oga::Lexer.new(options).lex(input) end ## @@ -28,7 +29,7 @@ module Oga # @return [Oga::AST::Node] # def parse_html(input) - return Oga::Parser.new.parse(input) + return Oga::Parser.new(:html => true).parse(input) end end # ParsingHelpers end # Oga