From 4f94d03a85f6acabe5cc57ba8c778928e42186be Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Wed, 19 Aug 2015 01:36:06 +0200 Subject: [PATCH] Improved NodeSet performance by a factor of 50 This change is broken up in to two parts: 1. Using a Hash to track if a node is already in a NodeSet 2. Only calling take_ownership when an owner is set == Using a Hash Previously various methods such as NodeSet#push and NodeSet#unshift would call Array#include? (on the internal "nodes" Array) to see if a node is already present in the set. This is quite problematic performance wise, especially for large NodeSets. In fact, for the attached benchmark the vast majority of the time was spent in Array#include? calls. Because a NodeSet demands ordering of nodes and must be able to access them by index (something Set can't do without relying on Enumerable), a Hash is used to separately keep track of what nodes are in a NodeSet. This means that checking the presence of a node is simply a matter of checking a Hash key's presence. == Calling take_ownership The if-check for the "owner" variable has been moved out of the "take_ownership" method and into the methods that call "take_ownership". This ensures the method isn't called in the first place if no owner is present, at the cost of slightly more code repetition. The same applies to the "remove_ownership" method. == Conclusion The combined result is a speedup of about 50x when running the attached concurrent_time_bench.rb benchmark. --- .../xpath/evaluator/concurrent_time_bench.rb | 51 +++++++++++++++ lib/oga/xml/node_set.rb | 65 +++++++++++++++---- 2 files changed, 103 insertions(+), 13 deletions(-) create mode 100644 benchmark/xpath/evaluator/concurrent_time_bench.rb diff --git a/benchmark/xpath/evaluator/concurrent_time_bench.rb b/benchmark/xpath/evaluator/concurrent_time_bench.rb new file mode 100644 index 0000000..96a0bcf --- /dev/null +++ b/benchmark/xpath/evaluator/concurrent_time_bench.rb @@ -0,0 +1,51 @@ +require_relative '../../benchmark_helper' +require 'thread' + +puts 'Preparing...' + +xml = read_big_kaf +xpath = 'KAF/terms/term' +xpath_ast = Oga::XPath::Parser.new(xpath).parse +output = Queue.new + +stop = false +threads = [] + +thread_count = ENV['THREADS'] ? ENV['THREADS'].to_i : 5 + +trap 'INT' do + stop = true +end + +# Parse these outside of the profiler +documents = thread_count.times.map { Oga.parse_xml(xml) } + +puts 'Starting threads...' + +require 'profile' if ENV['PROFILE'] + +thread_count.times.each do + threads << Thread.new do + oga_doc = documents.pop + evaluator = Oga::XPath::Evaluator.new(oga_doc) + + 10.times do + break if stop + + output << Benchmark.measure { evaluator.evaluate_ast(xpath_ast) } + end + end +end + +threads.each(&:join) + +samples = [] + +until output.empty? + samples << output.pop.real +end + +average = samples.inject(:+) / samples.length + +puts "Samples: #{samples.length}" +puts "Average: #{average.round(4)} seconds" diff --git a/lib/oga/xml/node_set.rb b/lib/oga/xml/node_set.rb index 78213ed..3ebcaa9 100644 --- a/lib/oga/xml/node_set.rb +++ b/lib/oga/xml/node_set.rb @@ -42,10 +42,15 @@ module Oga # @param [Oga::XML::NodeSet] owner The owner of the set. # def initialize(nodes = [], owner = nil) - @nodes = nodes - @owner = owner + @nodes = nodes + @owner = owner + @existing = {} - @nodes.each { |node| take_ownership(node) } if owner + @nodes.each do |node| + mark_existing(node) + + take_ownership(node) if @owner + end end ## @@ -103,11 +108,13 @@ module Oga # @param [Oga::XML::Node] node # def push(node) - return if @nodes.include?(node) + return if exists?(node) @nodes << node - take_ownership(node) + mark_existing(node) + + take_ownership(node) if @owner end alias_method :<<, :push @@ -118,11 +125,13 @@ module Oga # @param [Oga::XML::Node] node # def unshift(node) - return if @nodes.include?(node) + return if exists?(node) @nodes.unshift(node) - take_ownership(node) + mark_existing(node) + + take_ownership(node) if @owner end ## @@ -133,7 +142,11 @@ module Oga def shift node = @nodes.shift - remove_ownership(node) + if node + unmark_existing(node) + + remove_ownership(node) if @owner + end node end @@ -146,7 +159,11 @@ module Oga def pop node = @nodes.pop - remove_ownership(node) + if node + unmark_existing(node) + + remove_ownership(node) if @owner + end node end @@ -158,11 +175,13 @@ module Oga # @param [Oga::XML::Node] node # def insert(index, node) - return if @nodes.include?(node) + return if exists?(node) @nodes.insert(index, node) - take_ownership(node) + mark_existing(node) + + take_ownership(node) if @owner end ## @@ -257,7 +276,11 @@ module Oga def delete(node) removed = @nodes.delete(node) - remove_ownership(removed) if removed + if removed + unmark_existing(removed) + + remove_ownership(removed) if @owner + end removed end @@ -317,7 +340,7 @@ module Oga # @param [Oga::XML::Node] node # def take_ownership(node) - node.node_set = self if owner + node.node_set = self end ## @@ -328,6 +351,22 @@ module Oga def remove_ownership(node) node.node_set = nil if node.node_set == self end + + # @param [Oga::XML::Node|Oga::XML::Attribute] node + # @return [TrueClass|FalseClass] + def exists?(node) + @existing.key?(node) + end + + # @param [Oga::XML::Node|Oga::XML::Attribute] node + def mark_existing(node) + @existing[node] = true + end + + # @param [Oga::XML::Node|Oga::XML::Attribute] node + def unmark_existing(node) + @existing.delete(node) + end end # NodeSet end # XML end # Oga