/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h"

#include <algorithm>

#include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/core/lib/gtl/optional.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/lib/strings/stringprintf.h"

namespace xla {
namespace gpu {
namespace {

namespace se = perftools::gputools;

using se::DeviceMemoryBase;
using se::dnn::AlgorithmConfig;
using se::dnn::AlgorithmDesc;
using tensorflow::gtl::nullopt;
using tensorflow::gtl::optional;

// A scratch allocator that remembers every buffer it hands out and frees them
// all when it is destroyed, so profiling runs can't leak device memory.
class ScratchAllocator : public se::ScratchAllocator {
 public:
  ScratchAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator)
      : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}

  ~ScratchAllocator() override;

  int64 GetMemoryLimitInBytes(se::Stream* stream) override {
    return 1LL << 32;  // 4GB.  TODO(jlebar): Tune this?
  }
  int64 TotalAllocatedBytes() { return total_allocated_bytes_; }

  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
      se::Stream* stream, int64 byte_size) override;

 private:
  const int device_ordinal_;
  DeviceMemoryAllocator* memory_allocator_;
  std::vector<se::DeviceMemoryBase> allocated_buffers_;
  int64 total_allocated_bytes_ = 0;
};

ScratchAllocator::~ScratchAllocator() {
  for (auto& allocated_buffer : allocated_buffers_) {
    if (!memory_allocator_->Deallocate(device_ordinal_, &allocated_buffer)
             .ok()) {
      // The program can still continue with failed deallocation.
      LOG(ERROR) << "Failed to deallocate the allocated buffer: "
                 << allocated_buffer.opaque();
    }
  }
}

se::port::StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
    se::Stream* stream, int64 byte_size) {
  CHECK_GE(byte_size, 0) << "byte_size must be nonnegative.";
  if (byte_size > GetMemoryLimitInBytes(stream)) {
    return se::port::Status(
        se::port::error::RESOURCE_EXHAUSTED,
        tensorflow::strings::Printf(
            "Allocating %lld bytes exceeds the memory limit of %lld bytes.",
            byte_size, GetMemoryLimitInBytes(stream)));
  }

  auto status_or_memory = memory_allocator_->Allocate(
      device_ordinal_, byte_size, /*retry_on_failure=*/false);
  if (!status_or_memory.ok()) {
    return se::port::Status(se::port::error::RESOURCE_EXHAUSTED,
                            tensorflow::strings::Printf(
                                "Failed to allocate %lld bytes on device %d.",
                                byte_size, device_ordinal_));
  }
  se::DeviceMemoryBase allocated_buffer = status_or_memory.ValueOrDie();
  allocated_buffers_.push_back(allocated_buffer);
  total_allocated_bytes_ += byte_size;
  return se::DeviceMemory<uint8>(allocated_buffer);
}

// Determines whether we can safely perform a winograd non-fused convolution
// for the given input and output shapes.  This works around b/68264959, an
// integer overflow in cuDNNv5 and cuDNNv6.
//
// TODO(jlebar): We shouldn't need this check for cuDNNv7.
bool ShouldIncludeWinogradNonfusedAlgo(
    const Shape& input_shape, const Shape& output_shape,
    const ConvolutionDimensionNumbers& dnums) {
  int64 batch = input_shape.dimensions(dnums.input_batch_dimension());
  int64 in_depths = input_shape.dimensions(dnums.input_feature_dimension());
  int64 in_rows = input_shape.dimensions(dnums.input_spatial_dimensions(0));
  int64 in_cols =
      dnums.input_spatial_dimensions_size() == 1
          ? 1
          : input_shape.dimensions(dnums.input_spatial_dimensions(1));
  int64 out_depths = output_shape.dimensions(dnums.output_feature_dimension());

  // Rough estimate of the buffer size the winograd non-fused algorithm needs;
  // if it reaches 2^31 bytes, cuDNN's internal arithmetic can overflow
  // (b/68264959).
  int64 total_size = CeilOfRatio(batch, int64{16}) *
                     std::max(in_depths, out_depths) * in_cols * in_rows *
                     sizeof(float);

  const int64 threshold = 1LL << 31;
  return total_size < threshold;
}

std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
                                         bool with_winograd_nonfused,
                                         se::StreamExecutor* stream_exec_) {
  std::vector<AlgorithmDesc> algorithms;
  switch (kind) {
    case CudnnConvKind::kBackwardFilter:
      CHECK(stream_exec_->GetConvolveBackwardFilterAlgorithms(
          with_winograd_nonfused, &algorithms));
      break;
    case CudnnConvKind::kBackwardInput:
      CHECK(stream_exec_->GetConvolveBackwardDataAlgorithms(
          with_winograd_nonfused, &algorithms));
      break;
    case CudnnConvKind::kForward:
      CHECK(stream_exec_->GetConvolveAlgorithms(with_winograd_nonfused,
                                                &algorithms));
      break;
  }

  return algorithms;
}

string AlgorithmToString(const AlgorithmDesc& algo) {
  if (algo.tensor_ops_enabled()) {
    return tensorflow::strings::StrCat(algo.algo_id(), "+TC");
  }
  return tensorflow::strings::StrCat(algo.algo_id());
}

string NumBytesToString(int64 bytes) {
  return tensorflow::strings::StrCat(
      tensorflow::strings::HumanReadableNumBytes(bytes), " (", bytes, "B)");
}

}  // anonymous namespace

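// Tries each cuDNN algorithm for this convolution and returns the fastest
// one, as an (algo_id, tensor_ops_enabled, scratch_bytes) tuple, or nullopt
// if no algorithm ran successfully.
//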
// We could have caching here so that we don't redo this work for two
// identical convolutions.  Unfortunately our cache key would have to be a
// tuple containing the protos passed to this function, and we have no utility
// for hashing protos.  We could write our own hash functions, but they'd
// silently break if we ever added a field to one of the protos.  Perhaps we
// could hack around this by using the binary-encoded proto as the hash key,
// on the assumption that two protos being binary-equal is a sufficient, if
// not necessary, condition for proper equality.  But that would still leave
// us open to unnecessary cache misses and extra work.  Overall, caching
// doesn't seem worth the trouble, but we may want to revisit this if we ever
// find a model where caching would speed up compilation a lot.
optional<std::tuple<int64, bool, int64>>
CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
    CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
    const Shape& output_shape, const Window& window,
    const ConvolutionDimensionNumbers& dnums, HloInstruction* instr) {
  // Create a stream for us to do our work on.
  se::Stream stream{stream_exec_};
  stream.Init();
  const auto device_ordinal = stream_exec_->device_ordinal();

  // allocator either points to this->allocator_ or, if that's null, to a
  // StreamExecutorMemoryAllocator for stream_exec_.
  DeviceMemoryAllocator* allocator;
  optional<StreamExecutorMemoryAllocator> se_allocator;
  if (allocator_ != nullptr) {
    allocator = allocator_;
  } else {
    se_allocator.emplace(
        stream_exec_->platform(),
        tensorflow::gtl::ArraySlice<se::StreamExecutor*>({stream_exec_}));
    allocator = &*se_allocator;
  }

  // Allocate space for the input, filter, and output of the convolution.  We
  // use a ScratchAllocator for this instead of calling allocator_ directly so
  // that our allocations don't leak.
  //
  // We don't put any data in these buffers, because (in theory, anyway) the
  // speed of a conv isn't affected by the data being convolved.
  ScratchAllocator input_output_allocator(device_ordinal, allocator);
  se::port::StatusOr<DeviceMemoryBase> input_buf =
      input_output_allocator.AllocateBytes(&stream,
                                           ShapeUtil::ByteSizeOf(input_shape));
  se::port::StatusOr<DeviceMemoryBase> filter_buf =
      input_output_allocator.AllocateBytes(
          &stream, ShapeUtil::ByteSizeOf(filter_shape));
  se::port::StatusOr<DeviceMemoryBase> output_buf =
      input_output_allocator.AllocateBytes(
          &stream, ShapeUtil::ByteSizeOf(output_shape));
  if (!input_buf.ok() || !filter_buf.ok() || !output_buf.ok()) {
    LOG(WARNING)
        << "Couldn't allocate space for input/filter/output of convolution "
        << instr->ToString() << ".  Falling back to default algorithm.";
    return nullopt;
  }
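
  // Profile each candidate algorithm on the uninitialized buffers and keep
  // the valid run with the lowest elapsed time.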
  const bool use_winograd_nonfused =
      ShouldIncludeWinogradNonfusedAlgo(input_shape, output_shape, dnums);
  se::dnn::ProfileResult best_result;
  int64 best_result_bytes_used = 0;

  for (const AlgorithmDesc& alg :
       GetAlgorithms(kind, use_winograd_nonfused, stream_exec_)) {
    ScratchAllocator scratch_allocator(device_ordinal, allocator);
    se::dnn::ProfileResult profile_result;
    VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
            << instr->ToString();

    bool launch_ok = RunCudnnConvolution(
                         kind, input_shape, filter_shape, output_shape,
                         input_buf.ValueOrDie(), filter_buf.ValueOrDie(),
                         output_buf.ValueOrDie(), &scratch_allocator, window,
                         dnums, AlgorithmConfig(alg), &stream, &profile_result)
                         .ok();

    if (launch_ok && profile_result.is_valid()) {
      int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
      VLOG(3) << "Run of algorithm " << AlgorithmToString(alg)
              << " succeeded, taking " << profile_result.elapsed_time_in_ms()
              << "ms and using " << NumBytesToString(scratch_bytes_used)
              << " of scratch (best result: "
              << best_result.elapsed_time_in_ms() << "ms, "
              << NumBytesToString(best_result_bytes_used) << " of scratch)";
      if (profile_result.elapsed_time_in_ms() <
          best_result.elapsed_time_in_ms()) {
        best_result = profile_result;
        best_result_bytes_used = scratch_bytes_used;
      }
    } else {
      VLOG(3) << "Run of algorithm " << AlgorithmToString(alg) << " failed.";
    }
  }
  if (best_result.is_valid()) {
    VLOG(2) << "Best algorithm for " << instr->ToString() << ": "
            << AlgorithmToString(best_result.algorithm()) << ", takes "
            << best_result.elapsed_time_in_ms() << "ms, and uses "
            << best_result_bytes_used << "B of scratch memory.";
    return std::make_tuple(best_result.algorithm().algo_id(),
                           best_result.algorithm().tensor_ops_enabled(),
                           best_result_bytes_used);
  }

  LOG(WARNING) << "All algorithms tried for convolution " << instr->ToString()
               << " failed.  Falling back to default algorithm.";
  return nullopt;
}
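
// Processes one cudnn-convolution custom call: picks the best algorithm and
// rewrites the call so that the algorithm, tensor-ops flag, and scratch size
// are baked into its operands and shape.  Returns true if the HLO changed.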
StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnInstruction(
    HloInstruction* instr) {
  CHECK(IsCustomCallToDnnConvolution(*instr));

  const auto& call_target = instr->custom_call_target();
  const auto& lhs_shape = instr->operand(0)->shape();
  const auto& rhs_shape = instr->operand(1)->shape();
  const auto& conv_result_shape = instr->shape().tuple_shapes(0);
  optional<std::tuple<int64, bool, int64>> alg_scratch_and_tc;
  if (call_target == kCudnnConvForwardCallTarget) {
    alg_scratch_and_tc = PickBestAlgorithm(
        CudnnConvKind::kForward, /*input_shape=*/lhs_shape,
        /*filter_shape=*/rhs_shape, /*output_shape=*/conv_result_shape,
        instr->window(), instr->convolution_dimension_numbers(), instr);
  } else if (call_target == kCudnnConvBackwardInputCallTarget) {
    alg_scratch_and_tc = PickBestAlgorithm(
        CudnnConvKind::kBackwardInput, /*input_shape=*/conv_result_shape,
        /*filter_shape=*/rhs_shape, /*output_shape=*/lhs_shape,
        instr->window(), instr->convolution_dimension_numbers(), instr);
  } else if (call_target == kCudnnConvBackwardFilterCallTarget) {
    alg_scratch_and_tc = PickBestAlgorithm(
        CudnnConvKind::kBackwardFilter, /*input_shape=*/lhs_shape,
        /*filter_shape=*/conv_result_shape, /*output_shape=*/rhs_shape,
        instr->window(), instr->convolution_dimension_numbers(), instr);
  } else {
    LOG(FATAL) << "Unknown custom call target for cudnn conv: "
               << instr->ToString();
  }

  if (!alg_scratch_and_tc.has_value()) {
    return false;
  }

  int64 algorithm;
  bool tensor_ops_enabled;
  int64 scratch_bytes;
  std::tie(algorithm, tensor_ops_enabled, scratch_bytes) = *alg_scratch_and_tc;

  VLOG(1) << "Setting cudnn conv to use algorithm " << algorithm << " and "
          << NumBytesToString(scratch_bytes)
          << " of scratch memory: " << instr->ToString()
          << " tensor_ops_enabled: " << tensor_ops_enabled;

  // Replace instr with a new CustomCall which has the correct algorithm, and
  // whose output shape has the appropriate amount of scratch memory.
  HloComputation* computation = instr->parent();
  Shape new_call_shape =
      ShapeUtil::MakeTupleShape({instr->shape().tuple_shapes(0),
                                 ShapeUtil::MakeShape(U8, {scratch_bytes})});
  HloInstruction* algorithm_hlo = computation->AddInstruction(
      HloInstruction::CreateConstant(Literal::CreateR0<int64>(algorithm)));
  HloInstruction* tensor_ops_enabled_hlo =
      computation->AddInstruction(HloInstruction::CreateConstant(
          Literal::CreateR0<bool>(tensor_ops_enabled)));

  HloInstruction* new_call =
      computation->AddInstruction(HloInstruction::CreateCustomCall(
          new_call_shape,
          {instr->mutable_operand(0), instr->mutable_operand(1), algorithm_hlo,
           tensor_ops_enabled_hlo},
          instr->custom_call_target()));
  new_call->set_window(instr->window());
  new_call->set_convolution_dimension_numbers(
      instr->convolution_dimension_numbers());

  // Repackage new_call so it has the same shape as the original call, namely
  // (conv_result, u8[0]).
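  // new_call's u8[scratch_bytes] tuple element is consumed only by the custom
  // call itself, so we substitute an empty u8[0] constant to preserve the
  // shape that existing users of instr expect.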
  HloInstruction* new_tuple =
      computation->AddInstruction(HloInstruction::CreateTuple(
          {computation->AddInstruction(HloInstruction::CreateGetTupleElement(
               new_call_shape.tuple_shapes(0), new_call, 0)),
           computation->AddInstruction(HloInstruction::CreateConstant(
               Literal::CreateR1<uint8>({})))}));

  TF_RETURN_IF_ERROR(instr->parent()->ReplaceInstruction(instr, new_tuple));
  return true;
}

StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnComputation(
    HloComputation* computation) {
  std::vector<HloInstruction*> convs;
  for (auto* instr : computation->instructions()) {
    if (IsCustomCallToDnnConvolution(*instr)) {
      convs.push_back(instr);
    }
  }

  bool changed = false;
  for (auto* instr : convs) {
    TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(instr));
    changed |= result;
  }
  return changed;
}

StatusOr<bool> CudnnConvolutionAlgorithmPicker::Run(HloModule* module) {
  bool changed = false;
  for (HloComputation* computation : module->MakeNonfusionComputations()) {
    TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
    changed |= result;
  }
  return changed;
}

}  // namespace gpu
}  // namespace xla