gpu_device.cc revision 7149a2e2e2f549035f23e21224ee41afe8df3876
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// TODO(opensource): Use a more generic sounding preprocessor name than
// GOOGLE_CUDA
#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include "tensorflow/core/common_runtime/gpu/gpu_device.h"

#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <list>
#include <map>
#include <tuple>
#include <vector>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h"
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
#include "tensorflow/core/common_runtime/gpu/process_state.h"
#include "tensorflow/core/common_runtime/gpu_device_context.h"
#include "tensorflow/core/common_runtime/local_device.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/variant_op_registry.h"
#include "tensorflow/core/graph/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/stl_util.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/cuda.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/tracing.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/public/session_options.h"
#include "tensorflow/core/util/device_name_utils.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/core/util/stream_executor_util.h"

namespace tensorflow {

// Eigen Ops directly allocate memory only for temporary buffers used
// during OpKernel::Compute(). The recommended way of allocating such
// memory is via OpKernelContext::allocate_temp(). However, Eigen Ops
// don't have access to OpKernelContext, instead they get access to
// memory directly through the device allocator. As an Open Source
// project, Eigen assumes allocator semantics similar to those of the
// CUDA memory allocator, and may not work correctly due to race
// conditions if used with some other allocator. For safety, we need
// to delay deallocation calls out of Eigen until all events on the
// corresponding stream have completed. The following two classes
// serve this purpose in two different compilation environments.

class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
 public:
  EigenCudaStreamDevice()
      : scratch_(nullptr), semaphore_(nullptr), context_(nullptr) {
    Eigen::initializeDeviceProp();
  }
  ~EigenCudaStreamDevice() override {}
  void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
                    TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc,
                    char* scratch) {
    if (LogMemory::IsEnabled()) {
      operation_ = context->op_kernel().name() + "/EigenAllocator";
      step_id_ = context->step_id();
    }
    context_ = context;
    scratch_ = scratch;
    semaphore_ =
        reinterpret_cast<unsigned int*>(scratch + Eigen::kCudaScratchSize);
    stream_ = cuda_stream;
    allocator_ = alloc;
    const int cuda_gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id).value();
    device_prop_ = &Eigen::m_deviceProperties[cuda_gpu_id];
  }

  const cudaStream_t& stream() const override { return *stream_; }
  const cudaDeviceProp& deviceProperties() const override {
    return *device_prop_;
  }

  void* allocate(size_t num_bytes) const override {
    void* ret = allocator_->AllocateRaw(32 /* alignment */, num_bytes);
    if (ret == nullptr) {
      if (context_) {
        context_->SetStatus(errors::ResourceExhausted(
            strings::StrCat("Ran out of GPU memory when allocating ",
                            num_bytes, " bytes for ", operation_)));
      } else {
        LOG(FATAL)
            << "EigenAllocator for GPU ran out of memory when allocating "
            << num_bytes << ". See error logs for more detailed info.";
      }
    }
    if (LogMemory::IsEnabled() && ret != nullptr) {
      LogMemory::RecordRawAllocation(operation_, step_id_, num_bytes, ret,
                                     allocator_);
    }
    return ret;
  }
  void deallocate(void* buffer) const override {
    if (LogMemory::IsEnabled() && buffer != nullptr) {
      LogMemory::RecordRawDeallocation(operation_, step_id_, buffer,
                                       allocator_, true);
    }
    AsyncFreeData* afData =
        new AsyncFreeData(allocator_, buffer, operation_, step_id_);
    cudaError_t err = cudaStreamAddCallback(*stream_, asyncFree, afData, 0);
    CHECK_EQ(err, cudaSuccess);
  }

  // Return a pointer to a per stream scratchpad of 1024 bytes residing
  // in global memory.
  void* scratchpad() const override { return scratch_; }

  // Return a semaphore. The semaphore is initially initialized to 0, and
  // each kernel using it is responsible for resetting to 0 upon completion
  // to maintain the invariant that the semaphore is always equal to 0 upon
  // each kernel start.
  unsigned int* semaphore() const override { return semaphore_; }

 private:
  struct AsyncFreeData {
    AsyncFreeData(::tensorflow::Allocator* a, void* p, const string& o,
                  const int64 s)
        : allocator_(a), address_(p), operation_(o), step_id_(s) {}
    ::tensorflow::Allocator* allocator_;
    void* address_;
    const string operation_;
    const int64 step_id_;
  };

  static void CUDART_CB asyncFree(cudaStream_t stream, cudaError_t status,
                                  void* userData) {
    AsyncFreeData* data = static_cast<AsyncFreeData*>(userData);
    if (LogMemory::IsEnabled()) {
      LogMemory::RecordRawDeallocation(data->operation_, data->step_id_,
                                       data->address_, data->allocator_,
                                       false);
    }
    data->allocator_->DeallocateRaw(data->address_);
    delete data;
  }

  string operation_;
  int64 step_id_;
  const cudaStream_t* stream_;          // Not owned.
  const cudaDeviceProp* device_prop_;   // Not owned.
  ::tensorflow::Allocator* allocator_;  // Not owned.
  mutable char* scratch_;
  mutable unsigned int* semaphore_;
  OpKernelContext* context_;

  TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
};

// This factory helps to ensure that different GPU device objects that refer to
// the same physical device and stream group id use the same stream group
// object (and therefore the same CUDA streams). This is necessary since there
// is a single memory allocator per device (see ProcessState::GetGPUAllocator)
// and allocators must not be shared across streams.
class BaseGPUDevice::StreamGroupFactory {
 public:
  // Returns the unique stream group for use with the stream defined by
  // {tf_gpu_id, stream_group_within_gpu}, creating it if it does not yet
  // exist.
  // This function is thread safe.
  BaseGPUDevice::StreamGroup* GetOrCreate(TfGpuId tf_gpu_id,
                                          int stream_group_within_gpu,
                                          gpu::StreamExecutor* executor) {
    mutex_lock guard(lock_);
    StreamGroup* group =
        &streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)];
    if (!group->compute) {
      group->compute = new gpu::Stream(executor);
      group->compute->Init();
      VLOG(2) << "Created stream[" << stream_group_within_gpu
              << "] = " << group->compute;

      group->host_to_device = new gpu::Stream(executor);
      group->host_to_device->Init();
      VLOG(2) << "Created host_to_device_stream[" << stream_group_within_gpu
              << "] = " << group->host_to_device;

      group->device_to_host = new gpu::Stream(executor);
      group->device_to_host->Init();
      VLOG(2) << "Created device_to_host_stream[" << stream_group_within_gpu
              << "] = " << group->device_to_host;

      group->device_to_device = new gpu::Stream(executor);
      group->device_to_device->Init();
      VLOG(2) << "Created device_to_device_stream[" << stream_group_within_gpu
              << "] = " << group->device_to_device;
    }
    return group;
  }

  // Returns a reference to the StreamGroupFactory singleton. Note that this is
  // never destroyed, so the objects it owns are never deleted.
  static StreamGroupFactory& Global() {
    static StreamGroupFactory* instance = new StreamGroupFactory();
    return *instance;
  }

 private:
  mutex lock_;
  using key_type = std::tuple<int, int>;
  std::map<key_type, StreamGroup> streams_;

  // StreamGroupFactory cannot be created directly; Call
  // StreamGroupFactory::Global() to get the global instance.
  StreamGroupFactory() = default;
  TF_DISALLOW_COPY_AND_ASSIGN(StreamGroupFactory);
};

BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
                             Bytes memory_limit,
                             const DeviceLocality& locality, TfGpuId tf_gpu_id,
                             const string& physical_device_desc,
                             Allocator* gpu_allocator, Allocator* cpu_allocator,
                             bool sync_every_op, int32 max_streams)
    : LocalDevice(options, Device::BuildDeviceAttributes(name, DEVICE_GPU,
                                                         memory_limit, locality,
                                                         physical_device_desc)),
      gpu_allocator_(gpu_allocator),
      cpu_allocator_(cpu_allocator),
      tf_gpu_id_(tf_gpu_id),
      sync_every_op_(sync_every_op),
      max_streams_(max_streams) {
  ProcessState::singleton()->EnableGPUDevice();
}

BaseGPUDevice::~BaseGPUDevice() {
  delete gpu_device_info_;
  for (auto ctx : device_contexts_) ctx->Unref();
}

Status BaseGPUDevice::Init(const SessionOptions& options) {
  auto executor_status = GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id_);
  if (!executor_status.status().ok()) {
    return errors::Internal("Failed to get StreamExecutor for device ",
                            tf_gpu_id_.value());
  }

  executor_ = executor_status.ValueOrDie();
  em_.reset(new EventMgr(executor_, options.config.gpu_options()));

  if (max_streams_ < 1) {
    return errors::InvalidArgument("Invalid value for max_streams.");
  }

  // Create the specified number of GPU streams
  for (int i = 0; i < max_streams_; i++) {
    streams_.push_back(
        StreamGroupFactory::Global().GetOrCreate(tf_gpu_id_, i, executor_));

    size_t scratch_buffer_size = Eigen::kCudaScratchSize + sizeof(unsigned int);
    void* scratch_buffer = gpu_allocator_->AllocateRaw(
        Allocator::kAllocatorAlignment, scratch_buffer_size);
    if (scratch_buffer == nullptr) {
      return errors::FailedPrecondition(
          "Failed to allocate scratch buffer for device ", tf_gpu_id_.value());
    }
    scratch_.push_back(static_cast<char*>(scratch_buffer));

    perftools::gputools::DeviceMemory<char> mem(
        perftools::gputools::DeviceMemoryBase(scratch_buffer,
                                              scratch_buffer_size));

    bool ok = executor_->SynchronousMemZero(
        &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
    if (!ok) {
      return errors::FailedPrecondition(
          "Failed to memcopy into scratch buffer for device ",
          tf_gpu_id_.value());
    }

    device_contexts_.push_back(new GPUDeviceContext(
        i, streams_.back()->compute, streams_.back()->host_to_device,
        streams_.back()->device_to_host, streams_.back()->device_to_device));
  }
  gpu_device_info_ = new GpuDeviceInfo;
  gpu_device_info_->stream = streams_[0]->compute;
  gpu_device_info_->default_context = device_contexts_[0];
  gpu_device_info_->event_mgr = em_.get();
  gpu_device_info_->gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id_).value();
  set_tensorflow_gpu_device_info(gpu_device_info_);

  // Whether and how the GPU device uses its own threadpool.
  // This option is experimental. Once we confirm the best setting, we
  // may change the default behavior and completely remove this flag.
  // Default values might change in future releases.
  // Possible values:
  //   * global: GPU uses threads shared with CPU in the main compute
  //     thread-pool. This is currently the default.
  //   * gpu_private: GPU uses threads dedicated to this device.
  //   * gpu_shared: All GPUs share a dedicated thread pool.
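  // Illustrative usage only (the script name is hypothetical): a user could
  // opt into a private two-thread pool for each GPU device with
  //   TF_GPU_THREAD_MODE=gpu_private TF_GPU_THREAD_COUNT=2 python train.py
  // using the environment variables read just below.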
  string gpu_thread_mode;
  TF_RETURN_IF_ERROR(
      ReadStringFromEnvVar("TF_GPU_THREAD_MODE", "global", &gpu_thread_mode));
  gpu_thread_mode = str_util::Lowercase(gpu_thread_mode);
  if (gpu_thread_mode != "global") {
    int64 gpu_thread_count = -1;
    // Default to two threads. One for device compute and another for memory
    // copies.
    TF_RETURN_IF_ERROR(
        ReadInt64FromEnvVar("TF_GPU_THREAD_COUNT", 2, &gpu_thread_count));
    if (gpu_thread_mode == "gpu_private") {
      // TODO(zhengxq): since these threads only serve a single GPU device,
      // we should set the device context once for each thread, and avoid
      // setting them for each kernel.
      // TODO(zhengxq): pin the thread to the same socket of the target GPU.
      thread_pool_.reset(new thread::ThreadPool(
          options.env, strings::StrCat("gpu_private_", tf_gpu_id_.value()),
          static_cast<int32>(gpu_thread_count)));
      set_tensorflow_device_thread_pool(thread_pool_.get());
    } else if (gpu_thread_mode == "gpu_shared") {
      static thread::ThreadPool* thread_pool = new thread::ThreadPool(
          options.env, "gpu_shared", static_cast<int32>(gpu_thread_count));
      set_tensorflow_device_thread_pool(thread_pool);
    } else {
      string error_message =
          strings::StrCat("Invalid gpu_thread_mode: ", gpu_thread_mode);
      LOG(WARNING) << error_message;
      return errors::InvalidArgument(error_message);
    }
  }

  return Status::OK();
}

bool BaseGPUDevice::RequiresRecordingAccessedTensors() const {
  // When there is no more than one stream, we release the tensor reference
  // at the end of the kernel launch, instead of at the end of the kernel
  // execution.
  return streams_.size() > 1;
}

Status BaseGPUDevice::FillContextMap(const Graph* graph,
                                     DeviceContextMap* device_context_map) {
  VLOG(2) << "FillContextMap";

  const size_t num_streams = streams_.size();
  // Special case for single stream.
  if (num_streams == 1) {
    return Status::OK();
  }
  const int64 before = Env::Default()->NowMicros();
  gpu_stream_util::AssignStreamsOpts opts;
  opts.max_streams = static_cast<int32>(num_streams);
  std::unordered_map<int, int> node_to_stream_id;
  TF_RETURN_IF_ERROR(
      gpu_stream_util::AssignStreams(graph, opts, &node_to_stream_id));
  int64 elapsed = Env::Default()->NowMicros() - before;
  VLOG(3) << "AssignStreams took " << elapsed << "us";

  // Fill in the context map. It is OK for this map to contain
  // duplicate DeviceContexts so long as we increment the refcount.
  device_context_map->resize(graph->num_node_ids());
  for (Node* n : graph->nodes()) {
    auto mapped_stream = node_to_stream_id[n->id()];
    CHECK_LE(mapped_stream, num_streams);
    auto ctx = device_contexts_[mapped_stream];
    VLOG(3) << "Assigned stream " << node_to_stream_id[n->id()]
            << " ==> stream[" << ctx->stream_id() << "] for node id "
            << n->id() << " " << n->type_string() << " " << n->name();
    ctx->Ref();
    (*device_context_map)[n->id()] = ctx;
  }

  return Status::OK();
}

void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
  // ScopedActivity is cheap when tracing is not active, but we
  // can avoid computing the Hash64.
  // TODO(pbar) This would no longer be needed if Ops have a unique id.
  const uint64 id =
      port::Tracing::IsActive() ? Hash64(op_kernel->name()) : 0;
  port::Tracing::ScopedActivity region(port::Tracing::EventCategory::kCompute,
                                       id);

  // NOTE(tucker): We need to discriminate between Eigen GPU
  // operations and all others. If an operation is Eigen
  // implemented (or otherwise tries to launch a cuda kernel
  // directly), we need to establish a stacked-scoped environment
  // that directs it to execute on the proper device. Otherwise we
  // expect the Op to use StreamExecutor directly and correctly. The
  // way we make this discrimination is quite hacky: At the moment
  // the only non-Eigen GPU Op is the recv-op, which is known to be
  // asynchronous.
  if (op_kernel->is_internal() && op_kernel->type_string() == "_Recv") {
    context->SetStatus(errors::Internal(
        "Invalid synchronous 'Compute' on GPU for '_Recv' op"));
  } else if (port::Tracing::ScopedAnnotation::Enabled()) {
    port::Tracing::ScopedAnnotation annotation(op_kernel->name(),
                                               op_kernel->type_string());
    ComputeHelper(op_kernel, context);
  } else {
    ComputeHelper(op_kernel, context);
  }
}

void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
                                  OpKernelContext* context) {
  GPUDeviceContext* gpu_device_context = device_contexts_[0];
  if (context->op_device_context() != nullptr) {
    gpu_device_context =
        static_cast<GPUDeviceContext*>(context->op_device_context());
  }
  gpu::Stream* stream = gpu_device_context->stream();
  const auto stream_id = gpu_device_context->stream_id();

  const bool vlog_1 = VLOG_IS_ON(1);
  const bool vlog_2 = vlog_1 && VLOG_IS_ON(2);

  if (vlog_1) {
    VLOG(1) << "GpuDevice::Compute " << op_kernel->name() << " op "
            << op_kernel->type_string() << " on GPU" << tf_gpu_id_
            << " stream[" << stream_id << "]";
  }

  const auto num_streams = streams_.size();
  if (num_streams > 1) {
    // If this op's device context is different from the other contexts,
    // we must wait on the stream.
    for (int i = 0; i < context->num_inputs(); ++i) {
      const GPUDeviceContext* idc =
          static_cast<GPUDeviceContext*>(context->input_device_context(i));
      OP_REQUIRES(context, idc != nullptr,
                  errors::Internal("Input device context ", i,
                                   " was not set properly."));
      if (vlog_2) {
        const void* base;
        size_t len;
        if (context->has_input(i)) {
          if (IsRefType(context->input_dtype(i))) {
            Tensor tensor = context->mutable_input(i, false);
            base = DMAHelper::base(&tensor);
            len = tensor.TotalBytes();
          } else {
            const Tensor& tensor = context->input(i);
            base = DMAHelper::base(&tensor);
            len = tensor.TotalBytes();
          }
          LOG(INFO) << "Input " << i << " " << base << " " << len;
          LOG(INFO) << "  stream[" << stream_id << "].ThenWaitFor(stream["
                    << idc->stream_id() << "])"
                    << ((idc->stream() == stream) ? " not needed" : "");
        }
      }
      if (idc->stream() != stream) stream->ThenWaitFor(idc->stream());
    }
  }
  gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
  op_kernel->Compute(context);
  if (context->status().ok()) {
    if (sync_every_op_) {
      // Note: GPUUtil::Sync() only syncs the default stream.
      // We need to either sync the stream used by this op, or
      // all streams. Given that this flag is typically used for
      // debugging it makes more sense to sync all GPU activity.
      context->SetStatus(GPUUtil::SyncAll(this));
    }
  }
}

void BaseGPUDevice::ConsumeListOfAccessedTensors(
    DeviceContext* device_context, const TensorReferenceVector& tensor_refs) {
  GPUDeviceContext* gpu_device_context = device_contexts_[0];
  if (device_context != nullptr) {
    gpu_device_context = static_cast<GPUDeviceContext*>(device_context);
  }
  gpu::Stream* stream = gpu_device_context->stream();
  em_->ThenDeleteTensors(stream, tensor_refs);
}

// Based on the semantics of Device::Sync this call should wait for
// all streams not just the current one.
Status BaseGPUDevice::Sync() { return GPUUtil::SyncAll(this); }

void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
                                 OpKernelContext* context,
                                 AsyncOpKernel::DoneCallback done) {
  GPUDeviceContext* gpu_device_context = device_contexts_[0];
  if (context->op_device_context() != nullptr) {
    gpu_device_context =
        static_cast<GPUDeviceContext*>(context->op_device_context());
  }
  gpu::Stream* stream = gpu_device_context->stream();
  const auto stream_id = gpu_device_context->stream_id();

  VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
          << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
          << stream_id << "]";

  // When TraceMe profiling is off (which is the default), the
  // following TraceMe constructor is simply a conditional test of
  // false value. Measurements show that its overhead is negligible.
  port::Tracing::TraceMe activity(op_kernel->name(), op_kernel->type_string(),
                                  op_kernel->IsExpensive());
  gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
  op_kernel->ComputeAsync(context, done);
}

Status BaseGPUDevice::MaybeCopyTensorToGPU(
    const AllocatorAttributes& alloc_attrs, const Tensor& from, Tensor* to,
    StatusCallback done) {
  if (alloc_attrs.on_host()) {
    *to = from;
    done(Status::OK());
    return Status::OK();
  } else {
    if (!DMAHelper::CanUseDMA(&from)) {
      Status err = errors::Internal("GPU copy from non-DMA ",
                                    DataTypeString(from.dtype()), " tensor");
      done(err);
      return err;
    }
    auto* copy =
        new Tensor(GetAllocator(alloc_attrs), from.dtype(), from.shape());

    // If the tensor is not initialized, we likely ran out of memory.
    if (!copy->IsInitialized()) {
      delete copy;
      Status err = errors::ResourceExhausted(
          "OOM when allocating tensor of shape ", from.shape().DebugString(),
          " and type ", DataTypeString(from.dtype()));
      done(err);
      return err;
    }

    StatusCallback wrapped_done = std::bind(
        [to, copy](StatusCallback done_,
                   // Begin unbound arguments.
                   const Status& s) {
          *to = std::move(*copy);
          delete copy;
          done_(s);
        },
        std::move(done), std::placeholders::_1);

    port::Tracing::ScopedAnnotation annotation("MakeTensorFromProto");
    device_contexts_[0]->CopyCPUTensorToDevice(&from, this, copy,
                                               std::move(wrapped_done));
    return Status::OK();
  }
}

Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
                                          const AllocatorAttributes alloc_attrs,
                                          Tensor* tensor) {
  AllocatorAttributes attr;
  attr.set_on_host(true);
  attr.set_gpu_compatible(true);
  Allocator* host_alloc = GetAllocator(attr);
  Tensor parsed(tensor_proto.dtype());
  if (!parsed.FromProto(host_alloc, tensor_proto)) {
    return errors::InvalidArgument("Cannot parse tensor from proto: ",
                                   tensor_proto.DebugString());
  }

  if (parsed.dtype() == DT_VARIANT) {
    const Variant* from = parsed.flat<Variant>().data();
    Tensor copy(cpu_allocator(), DT_VARIANT, parsed.shape());
    Variant* copy_variant = copy.flat<Variant>().data();

    std::list<Notification> notifications;
    Status copy_status;
    auto copier = [this, &alloc_attrs, &notifications, &copy_status](
                      const Tensor& from, Tensor* to) {
      // Copier isn't run in a multithreaded environment, so we don't
      // have to worry about the notifications list being modified in parallel.
      notifications.emplace_back();
      Notification& n = *notifications.rbegin();
      return MaybeCopyTensorToGPU(alloc_attrs, from, to,
                                  [&n, &copy_status](const Status& s) {
                                    if (copy_status.ok()) {
                                      copy_status.Update(s);
                                    }
                                    n.Notify();
                                  });
    };
    Status s;
    for (int64 ix = 0; ix < parsed.NumElements(); ++ix) {
      s = VariantDeviceCopy(VariantDeviceCopyDirection::HOST_TO_DEVICE,
                            from[ix], &copy_variant[ix], copier);
      if (!s.ok()) {
        break;
      }
    }
    for (auto& n : notifications) {
      n.WaitForNotification();
    }
    if (!s.ok()) {
      return s;
    }
    *tensor = std::move(copy);
    return copy_status;
  } else {
    Notification n;
    Status status;
    TF_RETURN_IF_ERROR(MaybeCopyTensorToGPU(alloc_attrs, parsed, tensor,
                                            [&n, &status](const Status& s) {
                                              status = s;
                                              n.Notify();
                                            }));
    n.WaitForNotification();
    return status;
  }
}

namespace {
class ConcretePerOpGpuDevice : public PerOpGpuDevice {
 public:
  ConcretePerOpGpuDevice() : device_(&stream_device_) {}

  void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
                    TfGpuId tf_gpu_id, Allocator* base_allocator,
                    char* scratch) {
    stream_device_.Reinitialize(context, cuda_stream, tf_gpu_id,
                                base_allocator, scratch);
  }

  const Eigen::GpuDevice& device() const override { return device_; }

 private:
  EigenCudaStreamDevice stream_device_;
  Eigen::GpuDevice device_;
};

// Parse 'visible_device_list' into a list of CUDA GPU ids.
Status ParseVisibleDeviceList(const string& visible_device_list,
                              std::vector<CudaGpuId>* visible_gpu_order) {
  visible_gpu_order->clear();
  gpu::Platform* gpu_manager = GPUMachineManager();

  // If the user wants to remap the visible to virtual GPU mapping,
  // check for that here.
  if (visible_device_list.empty()) {
    visible_gpu_order->resize(gpu_manager->VisibleDeviceCount());
    // By default, visible to virtual mapping is unchanged.
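    // For example, with three visible CUDA devices and an empty
    // visible_device_list, the resulting order is simply {0, 1, 2}; a
    // non-empty list such as "2,0" (handled in the branch below) would
    // instead map virtual GPU:0 -> CUDA GPU 2 and virtual GPU:1 -> CUDA GPU 0.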
    int deviceNo = 0;
    std::generate(visible_gpu_order->begin(), visible_gpu_order->end(),
                  [&deviceNo] { return deviceNo++; });
  } else {
    const std::vector<string> order_str =
        str_util::Split(visible_device_list, ',');
    for (const string& cuda_gpu_id_str : order_str) {
      int32 cuda_gpu_id;
      if (!strings::safe_strto32(cuda_gpu_id_str, &cuda_gpu_id)) {
        return errors::InvalidArgument(
            "Could not parse entry in 'visible_device_list': '",
            cuda_gpu_id_str, "'. visible_device_list = ", visible_device_list);
      }
      if (cuda_gpu_id < 0 || cuda_gpu_id >= gpu_manager->VisibleDeviceCount()) {
        return errors::InvalidArgument(
            "'visible_device_list' listed an invalid GPU id '", cuda_gpu_id,
            "' but visible device count is ",
            gpu_manager->VisibleDeviceCount());
      }
      visible_gpu_order->push_back(CudaGpuId(cuda_gpu_id));
    }
  }

  // Validate no repeats.
  std::set<CudaGpuId> visible_device_set(visible_gpu_order->begin(),
                                         visible_gpu_order->end());
  if (visible_device_set.size() != visible_gpu_order->size()) {
    return errors::InvalidArgument(
        "visible_device_list contained a duplicate entry: ",
        visible_device_list);
  }
  return Status::OK();
}

Status VerifyVirtualDeviceSettings(
    const size_t num_gpus_to_use, const GPUOptions& gpu_options,
    const std::vector<CudaGpuId>& visible_gpu_order,
    const std::vector<CudaGpuId>& valid_cuda_gpu_ids) {
  const auto& virtual_devices = gpu_options.experimental().virtual_devices();
  CHECK(!virtual_devices.empty());
  if (gpu_options.per_process_gpu_memory_fraction() > 0) {
    return errors::InvalidArgument(
        "It's invalid to set per_process_gpu_memory_fraction when "
        "virtual_devices is set.");
  }
  if (num_gpus_to_use < virtual_devices.size()) {
    return errors::Unknown(
        "Not enough GPUs to create virtual devices."
        " num_gpus_to_use: ",
        num_gpus_to_use, " #virtual_devices: ", virtual_devices.size());
  }
  if (!gpu_options.visible_device_list().empty() &&
      visible_gpu_order.size() != virtual_devices.size()) {
    return errors::InvalidArgument(
        "The number of GPUs in visible_device_list doesn't match the number "
        "of elements in the virtual_devices list.",
        " #GPUs in visible_device_list: ", visible_gpu_order.size(),
        " virtual_devices.size(): ", virtual_devices.size());
  }
  if (valid_cuda_gpu_ids.size() != virtual_devices.size()) {
    return errors::Unknown(
        "The number of valid GPUs doesn't match the number of elements in "
        "the virtual_devices list.",
        " #valid GPUs: ", valid_cuda_gpu_ids.size(),
        " virtual_devices.size(): ", virtual_devices.size());
  }
  return Status::OK();
}

int64 MinSystemMemory(int64 available_memory) {
  // We use the following heuristic for now:
  //
  // If the available_memory is < 2GiB, we allocate 225MiB to system memory.
  // Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory.
  //
  // In the future we could be more sophisticated by using a table of devices.
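  //
  // For example, on a hypothetical card reporting 12 GiB of available memory,
  // this reserves max(300 MiB, 0.05 * 12 GiB) ~= 614 MiB for system use.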
  int64 min_system_memory;
  if (available_memory < (1LL << 31)) {
    // 225MiB
    min_system_memory = 225 * 1024 * 1024;
  } else {
    // max(300 MiB, 0.05 * available_memory)
    min_system_memory =
        std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
  }
#if defined(__GNUC__) && defined(__OPTIMIZE__)
// Do nothing
#elif !defined(__GNUC__) && defined(NDEBUG)
// Do nothing
#else
  // Double the amount of available GPU memory in non-opt builds (debug
  // builds in windows); because in non-opt builds more system memory
  // is necessary.
  min_system_memory *= 2;
#endif
#if defined(NVIDIA_TEGRA)
  // 1GB system mem for NVIDIA Tegra devices since they use the same mem for
  // RAM and Video RAM
  min_system_memory = 1 << 30;
#endif
  return min_system_memory;
}

// Get the memory limit for the virtual device being created on GPU with
// 'cuda_gpu_id', when that virtual device is the only virtual device being
// created on that GPU.
Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options,
                                      CudaGpuId cuda_gpu_id,
                                      int64* memory_limit) {
  int64 total_memory = 0;
  int64 available_memory = 0;
  gpu::StreamExecutor* se =
      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
  if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) {
    return errors::Unknown("Failed to query available memory for GPU ",
                           cuda_gpu_id.value());
  }

  int64 allocated_memory = 0;
  const double per_process_gpu_memory_fraction =
      gpu_options.per_process_gpu_memory_fraction();
  if (per_process_gpu_memory_fraction == 0) {
    allocated_memory = available_memory;
    const int64 min_system_memory = MinSystemMemory(available_memory);
    if (min_system_memory < allocated_memory) {
      allocated_memory -= min_system_memory;
    }
  } else {
    allocated_memory = total_memory * per_process_gpu_memory_fraction;
  }
  *memory_limit = allocated_memory;
  return Status::OK();
}
}  // namespace

void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
                                       PerOpGpuDevice* device, int stream_id,
                                       Allocator* allocator) {
  ConcretePerOpGpuDevice* concrete_device =
      static_cast<ConcretePerOpGpuDevice*>(device);
  DCHECK(concrete_device);
  const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>(
      streams_[stream_id]->compute->implementation()->CudaStreamMemberHack());
  concrete_device->Reinitialize(context, cuda_stream, tf_gpu_id_, allocator,
                                scratch_[stream_id]);
}

PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice() {
  return new ConcretePerOpGpuDevice();
}

void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
                                          PerOpGpuDevice* device,
                                          DeviceContext* dc,
                                          Allocator* allocator) {
  if (dc) {
    const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc);
    const int stream_id = gpu_dc->stream_id();
    VLOG(1) << "  eigen_gpu_device(" << dc << ") => stream[" << stream_id
            << "]";
    CHECK_LT(stream_id, streams_.size());
    ReinitializeDevice(context, device, stream_id, allocator);
  } else {
    ReinitializeDevice(context, device, 0, allocator);
  }
}

Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
                                           const string& name_prefix,
                                           std::vector<Device*>* devices) {
  TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
  gpu::Platform* gpu_manager = GPUMachineManager();
  if (gpu_manager == nullptr) {
    return Status::OK();
  }
  // If there are no GPUs visible, do nothing.
  if (gpu_manager->VisibleDeviceCount() <= 0) {
    return Status::OK();
  }

  size_t num_gpus_to_use = INT_MAX;
  auto iter = options.config.device_count().find("GPU");
  if (iter != options.config.device_count().end()) {
    num_gpus_to_use = iter->second;
  }
  const auto& gpu_options = options.config.gpu_options();
  std::vector<CudaGpuId> visible_gpu_order;
  TF_RETURN_IF_ERROR(ParseVisibleDeviceList(gpu_options.visible_device_list(),
                                            &visible_gpu_order));

  std::vector<CudaGpuId> valid_cuda_gpu_ids;
  TF_RETURN_IF_ERROR(GetValidDeviceIds(visible_gpu_order, &valid_cuda_gpu_ids));
  if (num_gpus_to_use > valid_cuda_gpu_ids.size()) {
    num_gpus_to_use = valid_cuda_gpu_ids.size();
  }
  if (!valid_cuda_gpu_ids.empty()) {
    // Save the original device.
    int original_device = 0;
    cudaError_t err = cudaGetDevice(&original_device);
    if (err != cudaSuccess) {
      return errors::Internal("cudaGetDevice() failed. Status: ",
                              cudaGetErrorString(err));
    }
    // Force to implicitly initialize CUDA runtime on each valid GPU before
    // CreateGPUDevice().
    for (CudaGpuId cuda_gpu_id : valid_cuda_gpu_ids) {
      err = cudaSetDevice(cuda_gpu_id.value());
      if (err != cudaSuccess) {
        return errors::Internal("cudaSetDevice() on GPU:", cuda_gpu_id.value(),
                                " failed. Status: ", cudaGetErrorString(err));
      }
      err = cudaFree(nullptr);
      if (err != cudaSuccess) {
        return errors::Internal("CUDA runtime implicit initialization on GPU:",
                                cuda_gpu_id.value(), " failed. Status: ",
                                cudaGetErrorString(err));
      }
    }
    // Reset to the original device.
    err = cudaSetDevice(original_device);
    if (err != cudaSuccess) {
      return errors::Internal("cudaSetDevice() on GPU:", original_device,
                              " failed. Status: ", cudaGetErrorString(err));
    }
  }

  const auto& virtual_devices = gpu_options.experimental().virtual_devices();
  if (!virtual_devices.empty()) {
    TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(
        num_gpus_to_use, gpu_options, visible_gpu_order, valid_cuda_gpu_ids));
    // We've verified that num_gpus_to_use >= virtual_devices.size().
    num_gpus_to_use = virtual_devices.size();
    CHECK(gpu_options.visible_device_list().empty() ||
          valid_cuda_gpu_ids == visible_gpu_order);
  }
  int next_tf_gpu_id = 0;
  for (int i = 0; i < num_gpus_to_use; ++i) {
    const CudaGpuId cuda_gpu_id = valid_cuda_gpu_ids[i];
    std::vector<int64> memory_limit_bytes;
    if (virtual_devices.empty() ||
        virtual_devices.Get(i).memory_limit_mb_size() == 0) {
      int64 single_virtual_device_memory_limit = 0;
      TF_RETURN_IF_ERROR(SingleVirtualDeviceMemoryLimit(
          gpu_options, cuda_gpu_id, &single_virtual_device_memory_limit));
      memory_limit_bytes.push_back(single_virtual_device_memory_limit);
    } else {
      const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb();
      std::transform(memory_limit_mb.begin(), memory_limit_mb.end(),
                     std::back_inserter(memory_limit_bytes), [](float mb) {
                       return static_cast<int64>(mb) * (1ll << 20);
                     });
    }
    for (int64 bytes : memory_limit_bytes) {
      TfGpuId tf_gpu_id(next_tf_gpu_id);
      ++next_tf_gpu_id;
      GpuIdUtil::InsertTfCudaGpuIdPair(tf_gpu_id, cuda_gpu_id);
      TF_RETURN_IF_ERROR(
          CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes, devices));
    }
  }
  return Status::OK();
}

static string GetShortDeviceDescription(CudaGpuId cuda_gpu_id,
                                        const gpu::DeviceDescription& desc) {
  int cc_major;
  int cc_minor;
  if (!desc.cuda_compute_capability(&cc_major, &cc_minor)) {
    cc_major = 0;
    cc_minor = 0;
  }
  // LINT.IfChange
  return strings::StrCat("device: ", cuda_gpu_id.value(),
                         ", name: ", desc.name(),
                         ", pci bus id: ", desc.pci_bus_id(),
                         ", compute capability: ", cc_major, ".", cc_minor);
  // LINT.ThenChange(//tensorflow/python/platform/test.py)
}

Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
                                             const string& name_prefix,
                                             TfGpuId tf_gpu_id,
                                             int64 memory_limit,
                                             std::vector<Device*>* devices) {
  CHECK_GE(tf_gpu_id.value(), 0);
  const string device_name =
      strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value());

  // Look up the device, to see its attributes.
  GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
  gpu::StreamExecutor* se =
      GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
  const gpu::DeviceDescription& desc = se->GetDeviceDescription();
  int numa_node = desc.numa_node();
  if (numa_node < 0) {
    // For some reason the StreamExecutor couldn't get the NUMA
    // affinity of the GPU. If this is not a multi-socket mobo with
    // GPUs local to different buses, it doesn't matter. If it is, we
    // may run into trouble later with data transfer operations. The
    // trouble may manifest as slower than expected performance, or
    // outright failures.
    LOG(INFO) << "Could not identify NUMA node of " << device_name
              << ", defaulting to 0. Your kernel may not have been built "
              << "with NUMA support.";
    numa_node = 0;
  }
  Bytes allocated_bytes = static_cast<Bytes>(memory_limit);

  // Get GPU bus_id from its reported NUMA affinity. Because GPUs are
  // virtualized in some environments, we can't just use the GPU id.
  // NUMA locales are indexed from 0, buses are indexed from 1.
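  // For example, a GPU whose StreamExecutor reports NUMA node 0 is assigned
  // bus_id 1 below.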
  DeviceLocality dev_locality;
  dev_locality.set_bus_id(numa_node + 1);
  const CudaGpuId cuda_gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id);
  VLOG(1) << "GPUDevice id " << cuda_gpu_id << " on bus "
          << dev_locality.bus_id() << " numa: " << numa_node
          << " pci: " << desc.pci_bus_id();

  LOG(INFO) << "Creating TensorFlow device (" << device_name << " with "
            << (memory_limit >> 20) << " MB memory) -> physical GPU ("
            << GetShortDeviceDescription(cuda_gpu_id, desc) << ")";
  ProcessState* process_state = ProcessState::singleton();
  BaseGPUDevice* gpu_device = CreateGPUDevice(
      options, device_name, allocated_bytes, dev_locality, tf_gpu_id,
      GetShortDeviceDescription(cuda_gpu_id, desc),
      process_state->GetGPUAllocator(options.config.gpu_options(), tf_gpu_id,
                                     memory_limit),
      process_state->GetCPUAllocator(numa_node));
  TF_RETURN_IF_ERROR(gpu_device->Init(options));
  devices->push_back(gpu_device);

  return Status::OK();
}

static int GetDefaultMinGPUMultiprocessorCount(
    gpu::Platform* gpu_manager,
    const std::vector<CudaGpuId>& visible_gpu_order) {
  static const int kDefaultMinGPUMultiprocessorCount = 8;

  // Find the highest multi-processor count across all visible GPUs.
  int max_count = -1;
  for (int i = 0; i < visible_gpu_order.size(); ++i) {
    auto exec_status =
        GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_order[i]);
    if (!exec_status.ok()) {
      continue;
    }

    gpu::StreamExecutor* se = exec_status.ValueOrDie();
    const gpu::DeviceDescription& desc = se->GetDeviceDescription();
    max_count = std::max(max_count, desc.core_count());
  }

  if (max_count < 0 || kDefaultMinGPUMultiprocessorCount < max_count) {
    return kDefaultMinGPUMultiprocessorCount;
  } else {
    return max_count;
  }
}

static int GetMinGPUMultiprocessorCount(
    gpu::Platform* gpu_manager,
    const std::vector<CudaGpuId>& visible_gpu_order) {
  const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");

  if (tf_min_gpu_core_count == nullptr ||
      strcmp(tf_min_gpu_core_count, "") == 0) {
    return GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
  }

  int min_gpu_core_count = -1;
  if (strings::safe_strto32(tf_min_gpu_core_count, &min_gpu_core_count)) {
    if (min_gpu_core_count >= 0) {
      return min_gpu_core_count;
    }
  }

  int count =
      GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
  LOG(ERROR) << "Invalid minimum GPU multiprocessor count: ["
             << tf_min_gpu_core_count << "]. "
             << "Using the default value: " << count;
  return count;
}

namespace {

struct CudaVersion {
  // Initialize from version_name in the form of "3.5"
  explicit CudaVersion(const std::string& version_name) {
    size_t dot_pos = version_name.find('.');
    CHECK(dot_pos != string::npos)
        << "Illegal version name: [" << version_name << "]";
    string major_str = version_name.substr(0, dot_pos);
    CHECK(strings::safe_strto32(major_str, &major_part))
        << "Illegal version name: [" << version_name << "]";
    string minor_str = version_name.substr(dot_pos + 1);
    CHECK(strings::safe_strto32(minor_str, &minor_part))
        << "Illegal version name: [" << version_name << "]";
  }
  CudaVersion() {}
  bool operator<(const CudaVersion& other) const {
    if (this->major_part != other.major_part) {
      return this->major_part < other.major_part;
    }
    return this->minor_part < other.minor_part;
  }
  friend std::ostream& operator<<(std::ostream& os,
                                  const CudaVersion& version) {
    os << version.major_part << "." << version.minor_part;
    return os;
  }
  int major_part = -1;
  int minor_part = -1;
};

std::vector<CudaVersion> supported_cuda_compute_capabilities = {
    TF_CUDA_CAPABILITIES,};

std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() {
  auto cuda_caps = supported_cuda_compute_capabilities;
#ifdef TF_EXTRA_CUDA_CAPABILITIES
// TF_EXTRA_CUDA_CAPABILITIES should be defined as a sequence separated by
// commas, for example:
//   TF_EXTRA_CUDA_CAPABILITIES=3.0,4.0,5.0
// Use two-level macro expansion for stringification.
#define TF_XSTRING(...) #__VA_ARGS__
#define TF_STRING(s) TF_XSTRING(s)
  string extra_cuda_caps = TF_STRING(TF_EXTRA_CUDA_CAPABILITIES);
#undef TF_STRING
#undef TF_XSTRING
  auto extra_capabilities = str_util::Split(extra_cuda_caps, ',');
  for (const auto& capability : extra_capabilities) {
    cuda_caps.push_back(CudaVersion(capability));
  }
#endif
  return cuda_caps;
}

std::unique_ptr<std::map<std::pair<int, int>, bool>> GetPeerAccessMap(
    gpu::Platform* platform, const std::vector<CudaGpuId>& visible_gpu_order) {
  std::unique_ptr<std::map<std::pair<int, int>, bool>> map(
      new std::map<std::pair<int, int>, bool>);
  for (int i = 0; i < visible_gpu_order.size(); ++i) {
    const CudaGpuId i_gpu_id = visible_gpu_order[i];
    for (int j = 0; j < visible_gpu_order.size(); ++j) {
      const CudaGpuId j_gpu_id = visible_gpu_order[j];
      gpu::StreamExecutor* from =
          GpuIdUtil::ExecutorForCudaGpuId(platform, i_gpu_id).ValueOrDie();
      gpu::StreamExecutor* to =
          GpuIdUtil::ExecutorForCudaGpuId(platform, j_gpu_id).ValueOrDie();
      (*map)[{i, j}] = from->CanEnablePeerAccessTo(to);
    }
  }

  return map;
}

Status EnablePeerAccess(gpu::Platform* platform,
                        const std::vector<CudaGpuId>& visible_gpu_order) {
  int possible_peer_count = 0;
  int enabled_peer_count = 0;
  for (int i = 0; i < visible_gpu_order.size(); ++i) {
    const CudaGpuId i_gpu_id = visible_gpu_order[i];
    for (int j = 0; j < visible_gpu_order.size(); ++j) {
      const CudaGpuId j_gpu_id = visible_gpu_order[j];
      // We have already validated that ExecutorForDevice() calls return OK.
      gpu::StreamExecutor* from =
          GpuIdUtil::ExecutorForCudaGpuId(platform, i_gpu_id).ValueOrDie();
      gpu::StreamExecutor* to =
          GpuIdUtil::ExecutorForCudaGpuId(platform, j_gpu_id).ValueOrDie();

      if (from->CanEnablePeerAccessTo(to)) {
        ++possible_peer_count;
        auto status = from->EnablePeerAccessTo(to);
        if (!status.ok()) {
          LOG(WARNING)
              << "Unable to enable peer access between device ordinals "
              << i_gpu_id << " and " << j_gpu_id << ", status: " << status;
        } else {
          ++enabled_peer_count;
        }
      }
    }
  }

  // Return an error in the extreme failure case where the driver
  // reported that peering was possible but not a single peering was
  // successful. This is to catch possible system misconfigurations
  // or more fundamental issues.
  if (possible_peer_count > 0 && enabled_peer_count == 0) {
    return errors::Internal(possible_peer_count,
                            " potential peer access pairs were reported by "
                            "the driver, but no peering could be enabled.");
  }
  return Status::OK();
}

}  // namespace

Status BaseGPUDeviceFactory::GetValidDeviceIds(
    const std::vector<CudaGpuId>& visible_gpu_order,
    std::vector<CudaGpuId>* ids) {
  gpu::Platform* gpu_manager = GPUMachineManager();
  bool new_gpu_found = false;
  for (int i = 0; i < visible_gpu_order.size(); ++i) {
    const CudaGpuId cuda_gpu_id = visible_gpu_order[i];

    // Only perform this once per visible cuda gpu id.
    if (visible_gpu_initialized_[cuda_gpu_id.value()]) {
      continue;
    }

    visible_gpu_initialized_[cuda_gpu_id.value()] = true;
    new_gpu_found = true;

    auto executor = GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, cuda_gpu_id);
    if (!executor.ok()) {
      return StreamExecutorUtil::ConvertStatus(executor.status());
    }

    auto stream_exec = executor.ValueOrDie();
    int64 free_bytes;
    int64 total_bytes;
    if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) {
      // Logs internally on failure.
      free_bytes = 0;
      total_bytes = 0;
    }
    const auto& description = stream_exec->GetDeviceDescription();
    int cc_major;
    int cc_minor;
    if (!description.cuda_compute_capability(&cc_major, &cc_minor)) {
      // Logs internally on failure.
      cc_major = 0;
      cc_minor = 0;
    }
    LOG(INFO) << "Found device " << i << " with properties: "
              << "\nname: " << description.name() << " major: " << cc_major
              << " minor: " << cc_minor
              << " memoryClockRate(GHz): " << description.clock_rate_ghz()
              << "\npciBusID: " << description.pci_bus_id()
              << "\ntotalMemory: "
              << strings::HumanReadableNumBytes(total_bytes)
              << " freeMemory: " << strings::HumanReadableNumBytes(free_bytes);
  }
  // Check peering and show the DMA matrix if more than one GPU was found.
  if (new_gpu_found && visible_gpu_order.size() > 1) {
    // Enable peer access
    TF_RETURN_IF_ERROR(EnablePeerAccess(gpu_manager, visible_gpu_order));

    // Print out a matrix showing which devices can DMA to one
    // another.
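    // For example, with two peer-capable GPUs the logged matrix looks like:
    //   DMA: 0 1
    //   0: Y Y
    //   1: Y Y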
    LOG(INFO) << "Device peer to peer matrix";
    auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
    string line_buf = "DMA: ";
    for (int i = 0; i < visible_gpu_order.size(); ++i) {
      strings::StrAppend(&line_buf, visible_gpu_order[i].value(), " ");
    }
    LOG(INFO) << line_buf;
    for (int i = 0; i < visible_gpu_order.size(); ++i) {
      line_buf = strings::StrCat(visible_gpu_order[i].value(), ": ");
      for (int j = 0; j < visible_gpu_order.size(); ++j) {
        if ((*access_map)[{i, j}]) {
          line_buf.append("Y ");
        } else {
          line_buf.append("N ");
        }
      }
      LOG(INFO) << line_buf;
    }
  }

  auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities();
  if (cuda_supported_capabilities.empty()) {
    return errors::FailedPrecondition(
        "No supported cuda capabilities in binary.");
  }
  CudaVersion min_supported_capability = *std::min_element(
      cuda_supported_capabilities.begin(), cuda_supported_capabilities.end());

  int min_gpu_core_count =
      GetMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);

  // Filter out devices that don't have the right capability or power.
  for (int i = 0; i < visible_gpu_order.size(); ++i) {
    const CudaGpuId visible_gpu_id = visible_gpu_order[i];
    auto exec_status =
        GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_id);
    if (!exec_status.ok()) {
      LOG(INFO) << "Ignoring visible gpu device " << visible_gpu_id
                << " whose executor is in invalid state: "
                << exec_status.status().ToString();
      continue;
    }
    gpu::StreamExecutor* se = exec_status.ValueOrDie();
    const gpu::DeviceDescription& desc = se->GetDeviceDescription();
    CudaVersion device_capability;
    if (!desc.cuda_compute_capability(&device_capability.major_part,
                                      &device_capability.minor_part)) {
      LOG(INFO) << "Ignoring visible gpu device "
                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
                << ") "
                << "whose CUDA compute capability is not available.";
      continue;
    }
    // Only GPUs with no less than the minimum supported compute capability
    // are accepted.
    if (device_capability < min_supported_capability) {
      LOG(INFO) << "Ignoring visible gpu device "
                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
                << ") "
                << "with Cuda compute capability " << device_capability
                << ". The minimum required Cuda capability is "
                << min_supported_capability << ".";
      continue;
    }

    // Filter out slow GPUs. By default, GPUs with a lower multiprocessor
    // count than the fastest GPU are filtered out, unless they have 8 or more
    // multiprocessors. If the TF_MIN_GPU_MULTIPROCESSOR_COUNT environment
    // variable is set, its value will be used to filter out GPUs.
    if (desc.core_count() < min_gpu_core_count) {
      LOG(INFO) << "Ignoring visible gpu device "
                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
                << ") "
                << "with Cuda multiprocessor count: " << desc.core_count()
                << ". The minimum required count is " << min_gpu_core_count
                << ". You can adjust this requirement with the env var "
                   "TF_MIN_GPU_MULTIPROCESSOR_COUNT.";
      continue;
    }
    ids->push_back(visible_gpu_id);
  }
  if (!ids->empty()) {
    std::vector<int> raw_ids(ids->size());
    std::transform(ids->begin(), ids->end(), raw_ids.begin(),
                   [](CudaGpuId id) -> int { return id.value(); });
    LOG(INFO) << "Adding visible gpu devices: "
              << str_util::Join(raw_ids, ", ");
  }

  return Status::OK();
}

}  // namespace tensorflow

#endif  // GOOGLE_CUDA