Basic support for lexing/parsing HTML5.

This will need a bunch of extra tests before I'll consider closing #7.
This commit is contained in:
Yorick Peterse 2014-03-16 23:42:24 +01:00
parent ce8bbdb64a
commit cb75edc30d
5 changed files with 179 additions and 15 deletions

View File

@ -6,6 +6,27 @@ module Oga
class Lexer
%% write data; # %
attr_reader :html
##
# Names of the HTML elements the lexer treats as void. When HTML lexing is
# enabled, an element whose name appears in this list is closed as soon as
# the ">" of its opening tag is reached, so no closing tag or trailing "/"
# is needed.
#
# The list is frozen so that it can't be modified by accident at runtime.
#
# @return [Array<String>]
#
HTML_VOID_ELEMENTS = [
  'area',
  'base',
  'br',
  'col',
  'command',
  'embed',
  'hr',
  'img',
  'input',
  'keygen',
  'link',
  'meta',
  'param',
  'source',
  'track',
  'wbr'
].freeze
# Lazy way of forwarding instance method calls used internally by Ragel to
# their corresponding class methods.
private_methods.grep(/^_lexer_/).each do |name|
@ -16,19 +37,24 @@ module Oga
private(name)
end
def initialize
##
# Builds a lexer and applies the given options before resetting its state.
#
# An option is stored in the instance variable of the same name, but only
# when the lexer responds to a method with that name; any other key is
# ignored.
#
# @param [Hash] options
#
def initialize(options = {})
  options.each do |name, setting|
    next unless respond_to?(name)

    instance_variable_set("@#{name}", setting)
  end

  reset
end
def reset
@line = 1
@column = 1
@data = nil
@ts = nil
@te = nil
@tokens = []
@stack = []
@top = 0
@line = 1
@column = 1
@data = nil
@ts = nil
@te = nil
@tokens = []
@stack = []
@top = 0
@elements = []
@string_buffer = ''
@text_buffer = ''
@ -49,6 +75,10 @@ module Oga
return tokens
end
##
# Tells whether the `html` option holds a truthy value.
#
# @return [TrueClass|FalseClass]
#
def html?
  html ? true : false
end
private
def advance_line
@ -93,6 +123,10 @@ module Oga
@string_buffer = ''
end
##
# Returns the most recently pushed entry of the element stack, or nil when
# the stack is empty.
#
# @return [String|NilClass]
#
def current_element
  @elements[-1]
end
%%{
# Use instance variables for `ts` and friends.
access @;
@ -255,6 +289,8 @@ module Oga
advance_column
end
@elements << name
add_token(:T_ELEM_NAME, name)
fcall element;
@ -270,7 +306,11 @@ module Oga
advance_line
};
^('<' | newline) => buffer_text;
^('<' | newline) => {
@text_buffer << text
emit_text_buffer if @te == eof
};
'<' => {
emit_text_buffer
@ -305,7 +345,15 @@ module Oga
# Consume the text inside the element.
'>' => {
# If HTML lexing is enabled and we're in a void element we'll bail
# out right away.
if html? and HTML_VOID_ELEMENTS.include?(current_element)
add_token(:T_ELEM_CLOSE, nil)
@elements.pop
end
advance_column
fcall element_text;
};
@ -325,6 +373,9 @@ module Oga
# Non self-closing elements.
'</' => {
fcall element_closing_tag;
@elements.pop
fret;
};
@ -332,6 +383,9 @@ module Oga
'/>' => {
advance_column
add_token(:T_ELEM_CLOSE, nil)
@elements.pop
fret;
};
*|;

View File

@ -139,8 +139,8 @@ end
---- inner
def initialize
@lexer = Lexer.new
##
# Creates the lexer used by this parser.
#
# @param [Hash] options Passed unchanged to the Lexer constructor.
#
def initialize(options = {})
@lexer = Lexer.new(options)
end
def reset

View File

@ -0,0 +1,45 @@
require 'spec_helper'
# Lexer specs for HTML void elements. Every example lexes its input with the
# :html option enabled.
#
# NOTE(review): the two trailing integers of each expected token look like
# the line and column of the token - confirm against the lexer.
describe Oga::Lexer do
context 'HTML void elements' do
# A void element written without a trailing "/" must still yield a
# T_ELEM_CLOSE token.
example 'lex a void element that omits the closing /' do
lex('<link>', :html => true).should == [
[:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NAME, 'link', 1, 2],
[:T_ELEM_CLOSE, nil, 1, 6]
]
end
# Text following a void element is emitted after the element was closed.
example 'lex text after a void element' do
lex('<link>foo', :html => true).should == [
[:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NAME, 'link', 1, 2],
[:T_ELEM_CLOSE, nil, 1, 6],
[:T_TEXT, 'foo', 1, 7]
]
end
# Two T_ELEM_CLOSE tokens are expected: the first for the void <link>, the
# second for the explicit </head>.
example 'lex a void element inside another element' do
lex('<head><link></head>', :html => true).should == [
[:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NAME, 'head', 1, 2],
[:T_ELEM_OPEN, nil, 1, 7],
[:T_ELEM_NAME, 'link', 1, 8],
[:T_ELEM_CLOSE, nil, 1, 12],
[:T_ELEM_CLOSE, nil, 1, 13]
]
end
# The newline after the void element is expected as a T_TEXT token, and the
# closing token of <head> is expected on the second line.
example 'lex a void element inside another element with whitespace' do
lex("<head><link>\n</head>", :html => true).should == [
[:T_ELEM_OPEN, nil, 1, 1],
[:T_ELEM_NAME, 'head', 1, 2],
[:T_ELEM_OPEN, nil, 1, 7],
[:T_ELEM_NAME, 'link', 1, 8],
[:T_ELEM_CLOSE, nil, 1, 12],
[:T_TEXT, "\n", 1, 13],
[:T_ELEM_CLOSE, nil, 2, 1]
]
end
end
end

View File

@ -0,0 +1,64 @@
require 'spec_helper'
# Parser specs for HTML void elements. All input goes through the
# `parse_html` spec helper.
#
# In the expected :element nodes below the second value is the element name,
# the fourth slot holds the attributes node (or nil) and child nodes follow.
# NOTE(review): the first slot is always nil here; presumably it is the
# namespace - confirm against the parser grammar.
describe Oga::Parser do
context 'HTML void elements' do
# A lone void element becomes an element node without attributes or children.
example 'parse a void element that omits the closing /' do
parse_html('<link>').should == s(
:document,
s(:element, nil, 'link', nil, nil)
)
end
# The void element must end up as a child of <head> instead of swallowing
# the </head> closing tag.
example 'parse a void element inside another element' do
parse_html('<head><link></head>').should == s(
:document,
s(:element, nil, 'head', nil, s(:element, nil, 'link', nil, nil))
)
end
# Attributes on the void element must be kept.
example 'parse a void element with attributes inside another element' do
parse_html('<head><link href="foo.css"></head>').should == s(
:document,
s(
:element,
nil,
'head',
nil,
s(
:element,
nil,
'link',
s(:attributes, s(:attribute, 'href', 'foo.css')),
nil
)
)
)
end
# <link> and <title> must be siblings: the void element may not become the
# parent of the element that follows it.
example 'parse a void element and a non void element in the same parent' do
parse_html('<head><link><title>Foo</title></head>').should == s(
:document,
s(
:element,
nil,
'head',
nil,
s(
:element,
nil,
'link',
nil,
nil
),
s(
:element,
nil,
'title',
nil,
s(:text, 'Foo')
)
)
)
end
end
end

View File

@ -15,10 +15,11 @@ module Oga
# Lexes a string and returns the tokens.
#
# @param [String] input
# @param [Hash] options
# @return [Array]
#
def lex(input)
return Oga::Lexer.new.lex(input)
def lex(input, options = {})
  # A fresh lexer is built for every call, configured with the given options.
  lexer = Oga::Lexer.new(options)

  return lexer.lex(input)
end
##
@ -28,7 +29,7 @@ module Oga
# @return [Oga::AST::Node]
#
def parse_html(input)
return Oga::Parser.new.parse(input)
return Oga::Parser.new(:html => true).parse(input)
end
end # ParsingHelpers
end # Oga