1/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3Licensed under the Apache License, Version 2.0 (the "License"); 4you may not use this file except in compliance with the License. 5You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9Unless required by applicable law or agreed to in writing, software 10distributed under the License is distributed on an "AS IS" BASIS, 11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12See the License for the specific language governing permissions and 13limitations under the License. 14==============================================================================*/ 15 16#ifndef TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_ 17#define TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_ 18 19#if GOOGLE_CUDA 20 21#include <unordered_map> 22 23#include "tensorflow/core/lib/strings/str_util.h" 24#include "tensorflow/core/lib/strings/strcat.h" 25#include "tensorflow/core/lib/strings/stringprintf.h" 26#include "tensorflow/core/platform/logging.h" 27#include "tensorflow/core/platform/stream_executor.h" 28 29namespace tensorflow { 30 31template <typename T> 32inline perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, 33 uint64 size) { 34 perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), 35 size * sizeof(T)); 36 perftools::gputools::DeviceMemory<T> typed(wrapped); 37 return typed; 38} 39 40// A helper class that looks up the best autotuned config from parameters. 41// Due to the noisy nature of autotune, especially with multiple devices, it 42// only accepts a config if its margin exceeds a threshold. 43// For the same shape configs, if a new best config matches the previous best, 44// they get promoted; otherwise, the winner gets demoted. This process stops 45// when the winner's score exceeds the threshold. 
// In a bad case when two configs are very close to each other and flips
// back and forth randomly, the expected number of experiments before autotune
// settles is O(threshold ^ 2). So we recommend that number of warmup runs
// for any benchmarks.
template <typename Parameters, typename Config>
class AutoTuneMap {
 public:
  // Looks up the winning config for `params` and copies it into *config,
  // returning true, but only once the entry has either reached
  // min_score_threshold_ or exhausted its max_autotune_count_ budget.
  // Returns false (leaving *config untouched) when there is no entry or the
  // entry is still competing, signalling the caller to keep autotuning.
  bool Find(const Parameters& params, Config* config) const {
    mutex_lock lock(mu_);
    auto iter = params_config_map_.find(params);
    if (iter == params_config_map_.end() ||
        (iter->second.score < min_score_threshold_ &&
         iter->second.count <= max_autotune_count_)) {
      return false;
    }
    *config = iter->second.config;
    return true;
  }
  // Records `config` as the latest measured winner for `params`.
  // A brand-new entry starts at score 1. While an entry is still below the
  // threshold and within its autotune budget: a matching config promotes it
  // (++score), a different config demotes it (--score) and erases it when
  // the score reaches 0 so a fresh competition can start.
  void Insert(const Parameters& params, const Config& config) {
    mutex_lock lock(mu_);
    auto iter = params_config_map_.find(params);
    int new_score = 0;
    if (iter == params_config_map_.end()) {
      // Create a new entry if params is new.
      VLOG(1) << GetActionSummary("creates", params, config);
      params_config_map_.insert(
          std::make_pair(params, ValueType{config, 1, 1}));
      new_score = 1;
    } else if (iter->second.score < min_score_threshold_ &&
               iter->second.count <= max_autotune_count_) {
      DCHECK_GT(iter->second.score, 0);
      if (iter->second.config != config) {
        // If it is different from the current winner, demotes the winner.
        VLOG(1) << GetActionSummary("demotes", params, config);
        new_score = --iter->second.score;
        ++iter->second.count;
        if (new_score <= 0) {
          // The demoted winner is out: drop the entry entirely so the next
          // Insert for these params starts over at score 1.
          VLOG(1) << GetActionSummary("erases", params, config);
          params_config_map_.erase(iter);
        }
      } else {
        // If it is the same as the current winner, promotes the winner.
        VLOG(1) << GetActionSummary("promotes", params, config);
        new_score = ++iter->second.score;
        ++iter->second.count;
      }
    }
    if (new_score >= min_score_threshold_) {
      VLOG(1) << GetActionSummary("accepts", params, config);
    }
  }

 private:
  // Constructor is private; only AutoTuneSingleton (friend below) may build
  // an instance. The acceptance threshold and warmup budget can be tuned via
  // the TF_AUTOTUNE_THRESHOLD and TF_AUTOTUNE_MIN_WARMUP_ITERATIONS
  // environment variables; invalid values leave the defaults in place
  // (safe_strto32 does not overwrite on parse failure — TODO confirm).
  AutoTuneMap(const string& name) : name_(name) {
    min_score_threshold_ = 1;
    int min_warmup_iterations = 10;
    const char* threshold_str = getenv("TF_AUTOTUNE_THRESHOLD");
    if (threshold_str != nullptr) {
      strings::safe_strto32(threshold_str, &min_score_threshold_);
    }
    const char* min_warmup_iteration_str =
        getenv("TF_AUTOTUNE_MIN_WARMUP_ITERATIONS");
    if (min_warmup_iteration_str != nullptr) {
      strings::safe_strto32(min_warmup_iteration_str, &min_warmup_iterations);
    }
    // Clamp to at least 1 so a zero/negative env value cannot make every
    // entry accepted immediately.
    min_score_threshold_ = std::max(min_score_threshold_, 1);
    max_autotune_count_ = std::max(
        5 * min_score_threshold_ * min_score_threshold_, min_warmup_iterations);
  }

  template <class Group, class Params, class Cfg>
  friend class AutoTuneSingleton;

  // Adapter so that Parameters (which is expected to provide hash()) can be
  // used as the key of the unordered_map below.
  struct Hasher {
    std::size_t operator()(const Parameters& parameter) const {
      return parameter.hash();
    }
  };

  // Formats a single VLOG line describing an autotune state transition,
  // e.g. "autotune_map <name> promotes: <params> -> (<config>)".
  string GetActionSummary(StringPiece action, const Parameters& params,
                          const Config& config) {
    return strings::Printf("autotune_map %s %s: %s -> (%s)", name_.c_str(),
                           action.ToString().c_str(), params.ToString().c_str(),
                           config.ToString().c_str());
  }

  mutable mutex mu_;
  // Per-params autotune state.
  struct ValueType {
    Config config;  // current winning config for these params
    int32 score;    // promotion/demotion score; accepted at threshold
    int32 count;    // total Insert calls that have touched this entry
  };
  std::unordered_map<Parameters, ValueType, Hasher> params_config_map_
      GUARDED_BY(mu_);
  string name_;
  int32 min_score_threshold_;
  int32 max_autotune_count_;

  TF_DISALLOW_COPY_AND_ASSIGN(AutoTuneMap);
};

// A Singleton helper that manages the global autotune results by groups.
148// The caller specified arbitrary Group type that can distinguish between 149// different autotune results, even if their Parameters and Configs are the 150// same. 151template <class Group, typename Parameters, typename Config> 152class AutoTuneSingleton { 153 public: 154 typedef AutoTuneMap<Parameters, Config> AutoTuneType; 155 static AutoTuneType* GetInstance() { 156 static AutoTuneType* instance = new AutoTuneType(Group::name()); 157 return instance; 158 } 159}; 160 161} // namespace tensorflow 162 163#endif // GOOGLE_CUDA 164 165#endif // TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_ 166