gpu_device.cc revision 7149a2e2e2f549035f23e21224ee41afe8df3876
1/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7    http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
16// TODO(opensource): Use a more generic sounding preprocessor name than
17// GOOGLE_CUDA
18#if GOOGLE_CUDA
19
20#define EIGEN_USE_GPU
21
22#include "tensorflow/core/common_runtime/gpu/gpu_device.h"
23
24#include <stdlib.h>
25#include <string.h>
26#include <algorithm>
27#include <list>
28#include <map>
29#include <tuple>
30#include <vector>
31
32#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
33#include "tensorflow/core/common_runtime/device_factory.h"
34#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
35#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
36#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
37#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
38#include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h"
39#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
40#include "tensorflow/core/common_runtime/gpu/process_state.h"
41#include "tensorflow/core/common_runtime/gpu_device_context.h"
42#include "tensorflow/core/common_runtime/local_device.h"
43#include "tensorflow/core/framework/allocator.h"
44#include "tensorflow/core/framework/device_base.h"
45#include "tensorflow/core/framework/op_kernel.h"
46#include "tensorflow/core/framework/tensor.h"
47#include "tensorflow/core/framework/tensor.pb.h"
48#include "tensorflow/core/framework/types.h"
49#include "tensorflow/core/framework/variant_op_registry.h"
50#include "tensorflow/core/graph/types.h"
51#include "tensorflow/core/lib/core/errors.h"
52#include "tensorflow/core/lib/core/status.h"
53#include "tensorflow/core/lib/gtl/stl_util.h"
54#include "tensorflow/core/lib/strings/numbers.h"
55#include "tensorflow/core/lib/strings/str_util.h"
56#include "tensorflow/core/lib/strings/strcat.h"
57#include "tensorflow/core/platform/cuda.h"
58#include "tensorflow/core/platform/logging.h"
59#include "tensorflow/core/platform/macros.h"
60#include "tensorflow/core/platform/stream_executor.h"
61#include "tensorflow/core/platform/tracing.h"
62#include "tensorflow/core/platform/types.h"
63#include "tensorflow/core/public/session_options.h"
64#include "tensorflow/core/util/device_name_utils.h"
65#include "tensorflow/core/util/env_var.h"
66#include "tensorflow/core/util/stream_executor_util.h"
67
68namespace tensorflow {
69
70// Eigen Ops directly allocate memory only for temporary buffers used
71// during OpKernel::Compute().  The recommended way of allocating such
72// memory is via OpKernelContext::allocate_temp().  However, Eigen Ops
73// don't have access to OpKernelContext, instead they get access to
74// memory directly through the device allocator.  As an Open Source
75// project, Eigen assumes allocator semantics similar to those of the
76// CUDA memory allocator, and may not work correctly due to race
77// conditions if used with some other allocator.  For safety, we need
78// to delay deallocation calls out of Eigen until all events on the
79// corresponding stream have completed.  The EigenCudaStreamDevice class
80// below serves this purpose.
81
82class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
83 public:
84  EigenCudaStreamDevice()
85      : scratch_(nullptr), semaphore_(nullptr), context_(nullptr) {
86    Eigen::initializeDeviceProp();
87  }
88  ~EigenCudaStreamDevice() override {}
89  void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
90                    TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc,
91                    char* scratch) {
92    if (LogMemory::IsEnabled()) {
93      operation_ = context->op_kernel().name() + "/EigenAllocator";
94      step_id_ = context->step_id();
95    }
96    context_ = context;
97    scratch_ = scratch;
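    // The scratch buffer is laid out as Eigen::kCudaScratchSize bytes of
    // per-stream scratch space followed by a single unsigned int that backs
    // semaphore() below; BaseGPUDevice::Init() allocates
    // kCudaScratchSize + sizeof(unsigned int) bytes accordingly.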
98    semaphore_ =
99        reinterpret_cast<unsigned int*>(scratch + Eigen::kCudaScratchSize);
100    stream_ = cuda_stream;
101    allocator_ = alloc;
102    const int cuda_gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id).value();
103    device_prop_ = &Eigen::m_deviceProperties[cuda_gpu_id];
104  }
105
106  const cudaStream_t& stream() const override { return *stream_; }
107  const cudaDeviceProp& deviceProperties() const override {
108    return *device_prop_;
109  }
110
111  void* allocate(size_t num_bytes) const override {
112    void* ret = allocator_->AllocateRaw(32 /* alignment */, num_bytes);
113    if (ret == nullptr) {
114      if (context_) {
115        context_->SetStatus(errors::ResourceExhausted(
116            strings::StrCat("Ran out of GPU memory when allocating ", num_bytes,
117                            " bytes for ", operation_)));
118      } else {
119        LOG(FATAL)
120            << "EigenAllocator for GPU ran out of memory when allocating "
121            << num_bytes << ". See error logs for more detailed info.";
122      }
123    }
124    if (LogMemory::IsEnabled() && ret != nullptr) {
125      LogMemory::RecordRawAllocation(operation_, step_id_, num_bytes, ret,
126                                     allocator_);
127    }
128    return ret;
129  }
130  void deallocate(void* buffer) const override {
131    if (LogMemory::IsEnabled() && buffer != nullptr) {
132      LogMemory::RecordRawDeallocation(operation_, step_id_, buffer, allocator_,
133                                       true);
134    }
135    AsyncFreeData* afData =
136        new AsyncFreeData(allocator_, buffer, operation_, step_id_);
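    // The callback added below runs only after all work previously enqueued
    // on this stream has completed, so the buffer cannot be handed back to
    // the allocator while a kernel might still be reading or writing it.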
137    cudaError_t err = cudaStreamAddCallback(*stream_, asyncFree, afData, 0);
138    CHECK_EQ(err, cudaSuccess);
139  }
140
141  // Return a pointer to a per stream scratchpad of 1024 bytes residing
142  // in global memory.
143  void* scratchpad() const override { return scratch_; }
144
145  // Return a semaphore. The semaphore is initialized to 0, and each
146  // kernel using it is responsible for resetting it to 0 upon completion,
147  // maintaining the invariant that the semaphore is always 0 when a
148  // kernel starts.
149  unsigned int* semaphore() const override { return semaphore_; }
150
151 private:
152  struct AsyncFreeData {
153    AsyncFreeData(::tensorflow::Allocator* a, void* p, const string& o,
154                  const int64 s)
155        : allocator_(a), address_(p), operation_(o), step_id_(s) {}
156    ::tensorflow::Allocator* allocator_;
157    void* address_;
158    const string operation_;
159    const int64 step_id_;
160  };
161
162  static void CUDART_CB asyncFree(cudaStream_t stream, cudaError_t status,
163                                  void* userData) {
164    AsyncFreeData* data = static_cast<AsyncFreeData*>(userData);
165    if (LogMemory::IsEnabled()) {
166      LogMemory::RecordRawDeallocation(data->operation_, data->step_id_,
167                                       data->address_, data->allocator_, false);
168    }
169    data->allocator_->DeallocateRaw(data->address_);
170    delete data;
171  }
172
173  string operation_;
174  int64 step_id_;
175  const cudaStream_t* stream_;          // Not owned.
176  const cudaDeviceProp* device_prop_;   // Not owned.
177  ::tensorflow::Allocator* allocator_;  // Not owned.
178  mutable char* scratch_;
179  mutable unsigned int* semaphore_;
180  OpKernelContext* context_;
181
182  TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
183};
184
185// This factory helps to ensure that different GPU device objects that refer to
186// the same physical device and stream group id use the same stream group
187// object (and therefore the same CUDA streams). This is necessary since there
188// is a single memory allocator per device (see ProcessState::GetGPUAllocator)
189// and allocators must not be shared across streams.
190class BaseGPUDevice::StreamGroupFactory {
191 public:
192  // Returns the unique stream group for use with the stream defined by
193  // {tf_gpu_id, stream_group_within_gpu}, creating it if it does not yet
194  // exist.
195  // This function is thread safe.
196  BaseGPUDevice::StreamGroup* GetOrCreate(TfGpuId tf_gpu_id,
197                                          int stream_group_within_gpu,
198                                          gpu::StreamExecutor* executor) {
199    mutex_lock guard(lock_);
200    StreamGroup* group =
201        &streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)];
202    if (!group->compute) {
203      group->compute = new gpu::Stream(executor);
204      group->compute->Init();
205      VLOG(2) << "Created stream[" << stream_group_within_gpu
206              << "] = " << group->compute;
207
208      group->host_to_device = new gpu::Stream(executor);
209      group->host_to_device->Init();
210      VLOG(2) << "Created host_to_device_stream[" << stream_group_within_gpu
211              << "] = " << group->host_to_device;
212
213      group->device_to_host = new gpu::Stream(executor);
214      group->device_to_host->Init();
215      VLOG(2) << "Created device_to_host_stream[" << stream_group_within_gpu
216              << "] = " << group->device_to_host;
217
218      group->device_to_device = new gpu::Stream(executor);
219      group->device_to_device->Init();
220      VLOG(2) << "Created device_to_device_stream[" << stream_group_within_gpu
221              << "] = " << group->device_to_device;
222    }
223    return group;
224  }
225
226  // Returns a reference to the StreamGroupFactory singleton. Note that this is
227  // never destroyed, so the objects it owns are never deleted.
228  static StreamGroupFactory& Global() {
229    static StreamGroupFactory* instance = new StreamGroupFactory();
230    return *instance;
231  }
232
233 private:
234  mutex lock_;
235  using key_type = std::tuple<int, int>;
236  std::map<key_type, StreamGroup> streams_;
237
238  // StreamGroupFactory cannot be created directly; call
239  // StreamGroupFactory::Global() to get the global instance.
240  StreamGroupFactory() = default;
241  TF_DISALLOW_COPY_AND_ASSIGN(StreamGroupFactory);
242};
243
244BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
245                             Bytes memory_limit, const DeviceLocality& locality,
246                             TfGpuId tf_gpu_id,
247                             const string& physical_device_desc,
248                             Allocator* gpu_allocator, Allocator* cpu_allocator,
249                             bool sync_every_op, int32 max_streams)
250    : LocalDevice(options, Device::BuildDeviceAttributes(name, DEVICE_GPU,
251                                                         memory_limit, locality,
252                                                         physical_device_desc)),
253      gpu_allocator_(gpu_allocator),
254      cpu_allocator_(cpu_allocator),
255      tf_gpu_id_(tf_gpu_id),
256      sync_every_op_(sync_every_op),
257      max_streams_(max_streams) {
258  ProcessState::singleton()->EnableGPUDevice();
259}
260
261BaseGPUDevice::~BaseGPUDevice() {
262  delete gpu_device_info_;
263  for (auto ctx : device_contexts_) ctx->Unref();
264}
265
266Status BaseGPUDevice::Init(const SessionOptions& options) {
267  auto executor_status = GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id_);
268  if (!executor_status.status().ok()) {
269    return errors::Internal("Failed to get StreamExecutor for device ",
270                            tf_gpu_id_.value());
271  }
272
273  executor_ = executor_status.ValueOrDie();
274  em_.reset(new EventMgr(executor_, options.config.gpu_options()));
275
276  if (max_streams_ < 1) {
277    return errors::InvalidArgument("Invalid value for max_streams.");
278  }
279
280  // Create the specified number of GPU streams
281  for (int i = 0; i < max_streams_; i++) {
282    streams_.push_back(
283        StreamGroupFactory::Global().GetOrCreate(tf_gpu_id_, i, executor_));
284
285    size_t scratch_buffer_size = Eigen::kCudaScratchSize + sizeof(unsigned int);
286    void* scratch_buffer = gpu_allocator_->AllocateRaw(
287        Allocator::kAllocatorAlignment, scratch_buffer_size);
288    if (scratch_buffer == nullptr) {
289      return errors::FailedPrecondition(
290          "Failed to allocate scratch buffer for device ", tf_gpu_id_.value());
291    }
292    scratch_.push_back(static_cast<char*>(scratch_buffer));
293
294    perftools::gputools::DeviceMemory<char> mem(
295        perftools::gputools::DeviceMemoryBase(scratch_buffer,
296                                              scratch_buffer_size));
297
298    bool ok = executor_->SynchronousMemZero(
299        &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
300    if (!ok) {
301      return errors::FailedPrecondition(
302          "Failed to zero the scratch buffer for device ",
303          tf_gpu_id_.value());
304    }
305
306    device_contexts_.push_back(new GPUDeviceContext(
307        i, streams_.back()->compute, streams_.back()->host_to_device,
308        streams_.back()->device_to_host, streams_.back()->device_to_device));
309  }
310  gpu_device_info_ = new GpuDeviceInfo;
311  gpu_device_info_->stream = streams_[0]->compute;
312  gpu_device_info_->default_context = device_contexts_[0];
313  gpu_device_info_->event_mgr = em_.get();
314  gpu_device_info_->gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id_).value();
315  set_tensorflow_gpu_device_info(gpu_device_info_);
316
317  // Whether and how the GPU device uses its own threadpool.
318  // This option is experimental. Once we confirm the best setting, we
319  // may change the default behavior and completely remove this flag.
320  // Default values might change in future releases.
321  // Possible values:
322  //   * global: GPU uses threads shared with CPU in the main compute
323  //          thread-pool. This is currently the default.
324  //   * gpu_private: GPU uses threads dedicated to this device.
325  //   * gpu_shared: All GPUs share a dedicated thread pool.
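  // For example (hypothetical invocation), a job could be launched with
  //   TF_GPU_THREAD_MODE=gpu_private TF_GPU_THREAD_COUNT=2
  // in its environment to give each GPU device two dedicated threads.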
326  string gpu_thread_mode;
327  TF_RETURN_IF_ERROR(
328      ReadStringFromEnvVar("TF_GPU_THREAD_MODE", "global", &gpu_thread_mode));
329  gpu_thread_mode = str_util::Lowercase(gpu_thread_mode);
330  if (gpu_thread_mode != "global") {
331    int64 gpu_thread_count = -1;
332    // Default to two threads. One for device compute and another for memory
333    // copies.
334    TF_RETURN_IF_ERROR(
335        ReadInt64FromEnvVar("TF_GPU_THREAD_COUNT", 2, &gpu_thread_count));
336    if (gpu_thread_mode == "gpu_private") {
337      // TODO(zhengxq): since these threads only serve a single GPU device,
338      //   we should set the device context once for each thread, and avoid
339      //   setting them for each kernel.
340      // TODO(zhengxq): pin the thread to the same socket of the target GPU.
341      thread_pool_.reset(new thread::ThreadPool(
342          options.env, strings::StrCat("gpu_private_", tf_gpu_id_.value()),
343          static_cast<int32>(gpu_thread_count)));
344      set_tensorflow_device_thread_pool(thread_pool_.get());
345    } else if (gpu_thread_mode == "gpu_shared") {
346      static thread::ThreadPool* thread_pool = new thread::ThreadPool(
347          options.env, "gpu_shared", static_cast<int32>(gpu_thread_count));
348      set_tensorflow_device_thread_pool(thread_pool);
349    } else {
350      string error_message =
351          strings::StrCat("Invalid gpu_thread_mode: ", gpu_thread_mode);
352      LOG(WARNING) << error_message;
353      return errors::InvalidArgument(error_message);
354    }
355  }
356
357  return Status::OK();
358}
359
360bool BaseGPUDevice::RequiresRecordingAccessedTensors() const {
361  // When there is no more than one stream, we release the tensor reference
362  // at the end of the kernel launch, instead of at the end of the kernel
363  // execution.
364  return streams_.size() > 1;
365}
366
367Status BaseGPUDevice::FillContextMap(const Graph* graph,
368                                     DeviceContextMap* device_context_map) {
369  VLOG(2) << "FillContextMap";
370
371  const size_t num_streams = streams_.size();
372  // Special case for single stream.
373  if (num_streams == 1) {
374    return Status::OK();
375  }
376  const int64 before = Env::Default()->NowMicros();
377  gpu_stream_util::AssignStreamsOpts opts;
378  opts.max_streams = static_cast<int32>(num_streams);
379  std::unordered_map<int, int> node_to_stream_id;
380  TF_RETURN_IF_ERROR(
381      gpu_stream_util::AssignStreams(graph, opts, &node_to_stream_id));
382  int64 elapsed = Env::Default()->NowMicros() - before;
383  VLOG(3) << "AssignStreams took " << elapsed << "us";
384
385  // Fill in the context map.  It is OK for this map to contain
386  // duplicate DeviceContexts so long as we increment the refcount.
387  device_context_map->resize(graph->num_node_ids());
388  for (Node* n : graph->nodes()) {
389    auto mapped_stream = node_to_stream_id[n->id()];
390    CHECK_LT(mapped_stream, num_streams);
391    auto ctx = device_contexts_[mapped_stream];
392    VLOG(3) << "Assigned stream " << node_to_stream_id[n->id()]
393            << " ==> stream[" << ctx->stream_id() << "] for node id " << n->id()
394            << " " << n->type_string() << " " << n->name();
395    ctx->Ref();
396    (*device_context_map)[n->id()] = ctx;
397  }
398
399  return Status::OK();
400}
401
402void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
403  // ScopedActivity is cheap when tracing is not active, but we can still
404  // avoid computing the Hash64 in that case.
405  // TODO(pbar) This would no longer be needed if Ops have a unique id.
406  const uint64 id = port::Tracing::IsActive() ? Hash64(op_kernel->name()) : 0;
407  port::Tracing::ScopedActivity region(port::Tracing::EventCategory::kCompute,
408                                       id);
409
410  // NOTE(tucker): We need to discriminate between Eigen GPU
411  // operations and all others.  If an operation is Eigen
412  // implemented (or otherwise tries to launch a cuda kernel
413  // directly), we need to establish a stacked-scoped environment
414  // that directs it to execute on the proper device.  Otherwise we
415  // expect the Op to use StreamExecutor directly and correctly.  The
416  // way we make this discrimination is quite hacky: At the moment
417  // the only non-Eigen GPU Op is the recv-op, which is known to be
418  // asynchronous.
419  if (op_kernel->is_internal() && op_kernel->type_string() == "_Recv") {
420    context->SetStatus(errors::Internal(
421        "Invalid synchronous 'Compute' on GPU for '_Recv' op"));
422  } else if (port::Tracing::ScopedAnnotation::Enabled()) {
423    port::Tracing::ScopedAnnotation annotation(op_kernel->name(),
424                                               op_kernel->type_string());
425    ComputeHelper(op_kernel, context);
426  } else {
427    ComputeHelper(op_kernel, context);
428  }
429}
430
431void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
432                                  OpKernelContext* context) {
433  GPUDeviceContext* gpu_device_context = device_contexts_[0];
434  if (context->op_device_context() != nullptr) {
435    gpu_device_context =
436        static_cast<GPUDeviceContext*>(context->op_device_context());
437  }
438  gpu::Stream* stream = gpu_device_context->stream();
439  const auto stream_id = gpu_device_context->stream_id();
440
441  const bool vlog_1 = VLOG_IS_ON(1);
442  const bool vlog_2 = vlog_1 && VLOG_IS_ON(2);
443
444  if (vlog_1) {
445    VLOG(1) << "GpuDevice::Compute " << op_kernel->name() << " op "
446            << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
447            << stream_id << "]";
448  }
449
450  const auto num_streams = streams_.size();
451  if (num_streams > 1) {
452    // If an input was produced on a different stream than this op's
453    // stream, this op's stream must first wait on that input's stream.
454    for (int i = 0; i < context->num_inputs(); ++i) {
455      const GPUDeviceContext* idc =
456          static_cast<GPUDeviceContext*>(context->input_device_context(i));
457      OP_REQUIRES(context, idc != nullptr,
458                  errors::Internal("Input device context ", i,
459                                   " was not set properly."));
460      if (vlog_2) {
461        const void* base;
462        size_t len;
463        if (context->has_input(i)) {
464          if (IsRefType(context->input_dtype(i))) {
465            Tensor tensor = context->mutable_input(i, false);
466            base = DMAHelper::base(&tensor);
467            len = tensor.TotalBytes();
468          } else {
469            const Tensor& tensor = context->input(i);
470            base = DMAHelper::base(&tensor);
471            len = tensor.TotalBytes();
472          }
473          LOG(INFO) << "Input " << i << " " << base << "  " << len;
474          LOG(INFO) << "  stream[" << stream_id << "].ThenWaitFor(stream["
475                    << idc->stream_id() << "])"
476                    << ((idc->stream() == stream) ? " not needed" : "");
477        }
478      }
479      if (idc->stream() != stream) stream->ThenWaitFor(idc->stream());
480    }
481  }
482  gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
483  op_kernel->Compute(context);
484  if (context->status().ok()) {
485    if (sync_every_op_) {
486      // Note: GPUUtil::Sync() only syncs the default stream.
487      // We need to either sync the stream used by this op, or
488      // all streams.  Given that this flag is typically used for
489      // debugging it makes more sense to sync all GPU activity.
490      context->SetStatus(GPUUtil::SyncAll(this));
491    }
492  }
493}
494
495void BaseGPUDevice::ConsumeListOfAccessedTensors(
496    DeviceContext* device_context, const TensorReferenceVector& tensor_refs) {
497  GPUDeviceContext* gpu_device_context = device_contexts_[0];
498  if (device_context != nullptr) {
499    gpu_device_context = static_cast<GPUDeviceContext*>(device_context);
500  }
501  gpu::Stream* stream = gpu_device_context->stream();
502  em_->ThenDeleteTensors(stream, tensor_refs);
503}
504
505// Based on the semantics of Device::Sync, this call should wait for
506// all streams, not just the current one.
507Status BaseGPUDevice::Sync() { return GPUUtil::SyncAll(this); }
508
509void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
510                                 OpKernelContext* context,
511                                 AsyncOpKernel::DoneCallback done) {
512  GPUDeviceContext* gpu_device_context = device_contexts_[0];
513  if (context->op_device_context() != nullptr) {
514    gpu_device_context =
515        static_cast<GPUDeviceContext*>(context->op_device_context());
516  }
517  gpu::Stream* stream = gpu_device_context->stream();
518  const auto stream_id = gpu_device_context->stream_id();
519
520  VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
521          << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
522          << stream_id << "]";
523
524  // When TraceMe profiling is off (which is the default), the
525  // following TraceMe constructor is simply a conditional test of a
526  // false value. Measurements show that its overhead is negligible.
527  port::Tracing::TraceMe activity(op_kernel->name(), op_kernel->type_string(),
528                                  op_kernel->IsExpensive());
529  gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
530  op_kernel->ComputeAsync(context, done);
531}
532
533Status BaseGPUDevice::MaybeCopyTensorToGPU(
534    const AllocatorAttributes& alloc_attrs, const Tensor& from, Tensor* to,
535    StatusCallback done) {
536  if (alloc_attrs.on_host()) {
537    *to = from;
538    done(Status::OK());
539    return Status::OK();
540  } else {
541    if (!DMAHelper::CanUseDMA(&from)) {
542      Status err = errors::Internal("GPU copy from non-DMA ",
543                                    DataTypeString(from.dtype()), " tensor");
544      done(err);
545      return err;
546    }
547    auto* copy =
548        new Tensor(GetAllocator(alloc_attrs), from.dtype(), from.shape());
549
550    // If the tensor is not initialized, we likely ran out of memory.
551    if (!copy->IsInitialized()) {
552      delete copy;
553      Status err = errors::ResourceExhausted(
554          "OOM when allocating tensor of shape ", from.shape().DebugString(),
555          " and type ", DataTypeString(from.dtype()));
556      done(err);
557      return err;
558    }
559
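    // Once the asynchronous copy finishes, move the result into '*to' and
    // forward the copy status to the caller's original callback.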
560    StatusCallback wrapped_done = std::bind(
561        [to, copy](StatusCallback done_,
562                   // Begin unbound arguments.
563                   const Status& s) {
564          *to = std::move(*copy);
565          delete copy;
566          done_(s);
567        },
568        std::move(done), std::placeholders::_1);
569
570    port::Tracing::ScopedAnnotation annotation("MakeTensorFromProto");
571    device_contexts_[0]->CopyCPUTensorToDevice(&from, this, copy,
572                                               std::move(wrapped_done));
573    return Status::OK();
574  }
575}
576
577Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
578                                          const AllocatorAttributes alloc_attrs,
579                                          Tensor* tensor) {
580  AllocatorAttributes attr;
581  attr.set_on_host(true);
582  attr.set_gpu_compatible(true);
583  Allocator* host_alloc = GetAllocator(attr);
584  Tensor parsed(tensor_proto.dtype());
585  if (!parsed.FromProto(host_alloc, tensor_proto)) {
586    return errors::InvalidArgument("Cannot parse tensor from proto: ",
587                                   tensor_proto.DebugString());
588  }
589
590  if (parsed.dtype() == DT_VARIANT) {
591    const Variant* from = parsed.flat<Variant>().data();
592    Tensor copy(cpu_allocator(), DT_VARIANT, parsed.shape());
593    Variant* copy_variant = copy.flat<Variant>().data();
594
595    std::list<Notification> notifications;
596    Status copy_status;
597    auto copier = [this, &alloc_attrs, &notifications, &copy_status](
598                      const Tensor& from, Tensor* to) {
599      // Copier isn't run in a multithreaded environment, so we don't
600      // have to worry about the notifications list being modified in parallel.
601      notifications.emplace_back();
602      Notification& n = *notifications.rbegin();
603      return MaybeCopyTensorToGPU(alloc_attrs, from, to,
604                                  [&n, &copy_status](const Status& s) {
605                                    if (copy_status.ok()) {
606                                      copy_status.Update(s);
607                                    }
608                                    n.Notify();
609                                  });
610    };
611    Status s;
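    // Copy each variant element to the GPU.  Each MaybeCopyTensorToGPU call
    // may complete asynchronously, so wait on every notification below
    // before returning.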
612    for (int64 ix = 0; ix < parsed.NumElements(); ++ix) {
613      s = VariantDeviceCopy(VariantDeviceCopyDirection::HOST_TO_DEVICE,
614                            from[ix], &copy_variant[ix], copier);
615      if (!s.ok()) {
616        break;
617      }
618    }
619    for (auto& n : notifications) {
620      n.WaitForNotification();
621    }
622    if (!s.ok()) {
623      return s;
624    }
625    *tensor = std::move(copy);
626    return copy_status;
627  } else {
628    Notification n;
629    Status status;
630    TF_RETURN_IF_ERROR(MaybeCopyTensorToGPU(alloc_attrs, parsed, tensor,
631                                            [&n, &status](const Status& s) {
632                                              status = s;
633                                              n.Notify();
634                                            }));
635    n.WaitForNotification();
636    return status;
637  }
638}
639
640namespace {
641class ConcretePerOpGpuDevice : public PerOpGpuDevice {
642 public:
643  ConcretePerOpGpuDevice() : device_(&stream_device_) {}
644
645  void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
646                    TfGpuId tf_gpu_id, Allocator* base_allocator,
647                    char* scratch) {
648    stream_device_.Reinitialize(context, cuda_stream, tf_gpu_id, base_allocator,
649                                scratch);
650  }
651
652  const Eigen::GpuDevice& device() const override { return device_; }
653
654 private:
655  EigenCudaStreamDevice stream_device_;
656  Eigen::GpuDevice device_;
657};
658
659// Parse 'visible_device_list' into a list of CUDA GPU ids.
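// For example, with four physical GPUs and visible_device_list = "2,0", the
// returned order is {2, 0}; assuming both devices pass the later validity
// checks, TF device /device:GPU:0 then maps to CUDA device 2 and
// /device:GPU:1 maps to CUDA device 0.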
660Status ParseVisibleDeviceList(const string& visible_device_list,
661                              std::vector<CudaGpuId>* visible_gpu_order) {
662  visible_gpu_order->clear();
663  gpu::Platform* gpu_manager = GPUMachineManager();
664
665  // If the user wants to override the default visible-to-virtual GPU
666  // mapping, check for that here.
667  if (visible_device_list.empty()) {
668    visible_gpu_order->resize(gpu_manager->VisibleDeviceCount());
669    // By default, the visible-to-virtual mapping is the identity.
670    int deviceNo = 0;
671    std::generate(visible_gpu_order->begin(), visible_gpu_order->end(),
672                  [&deviceNo] { return deviceNo++; });
673  } else {
674    const std::vector<string> order_str =
675        str_util::Split(visible_device_list, ',');
676    for (const string& cuda_gpu_id_str : order_str) {
677      int32 cuda_gpu_id;
678      if (!strings::safe_strto32(cuda_gpu_id_str, &cuda_gpu_id)) {
679        return errors::InvalidArgument(
680            "Could not parse entry in 'visible_device_list': '",
681            cuda_gpu_id_str, "'. visible_device_list = ", visible_device_list);
682      }
683      if (cuda_gpu_id < 0 || cuda_gpu_id >= gpu_manager->VisibleDeviceCount()) {
684        return errors::InvalidArgument(
685            "'visible_device_list' listed an invalid GPU id '", cuda_gpu_id,
686            "' but visible device count is ",
687            gpu_manager->VisibleDeviceCount());
688      }
689      visible_gpu_order->push_back(CudaGpuId(cuda_gpu_id));
690    }
691  }
692
693  // Validate no repeats.
694  std::set<CudaGpuId> visible_device_set(visible_gpu_order->begin(),
695                                         visible_gpu_order->end());
696  if (visible_device_set.size() != visible_gpu_order->size()) {
697    return errors::InvalidArgument(
698        "visible_device_list contained a duplicate entry: ",
699        visible_device_list);
700  }
701  return Status::OK();
702}
703
704Status VerifyVirtualDeviceSettings(
705    const size_t num_gpus_to_use, const GPUOptions& gpu_options,
706    const std::vector<CudaGpuId>& visible_gpu_order,
707    const std::vector<CudaGpuId>& valid_cuda_gpu_ids) {
708  const auto& virtual_devices = gpu_options.experimental().virtual_devices();
709  CHECK(!virtual_devices.empty());
710  if (gpu_options.per_process_gpu_memory_fraction() > 0) {
711    return errors::InvalidArgument(
712        "It's invalid to set per_process_gpu_memory_fraction when "
713        "virtual_devices is set.");
714  }
715  if (num_gpus_to_use < virtual_devices.size()) {
716    return errors::Unknown(
717        "Not enough GPUs to create virtual devices."
718        " num_gpus_to_use: ",
719        num_gpus_to_use, " #virtual_devices: ", virtual_devices.size());
720  }
721  if (!gpu_options.visible_device_list().empty() &&
722      visible_gpu_order.size() != virtual_devices.size()) {
723    return errors::InvalidArgument(
724        "The number of GPUs in visible_device_list doesn't match the number "
725        "of elements in the virtual_devices list.",
726        " #GPUs in visible_device_list: ", visible_gpu_order.size(),
727        " virtual_devices.size(): ", virtual_devices.size());
728  }
729  if (valid_cuda_gpu_ids.size() != virtual_devices.size()) {
730    return errors::Unknown(
731        "The number of valid GPUs doesn't match the number of elements in "
732        "the virtual_devices list.",
733        " #valid GPUs: ", valid_cuda_gpu_ids.size(),
734        " virtual_devices.size(): ", virtual_devices.size());
735  }
736  return Status::OK();
737}
738
739int64 MinSystemMemory(int64 available_memory) {
740  // We use the following heuristic for now:
741  //
742  // If the available_memory is < 2GiB, we allocate 225MiB to system memory.
743  // Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory.
744  //
745  // In the future we could be more sophisticated by using a table of devices.
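  // For example, a GPU reporting 12 GiB of available memory would reserve
  // max(300 MiB, 0.05 * 12 GiB) ~= 614 MiB for the system under this
  // heuristic.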
746  int64 min_system_memory;
747  if (available_memory < (1LL << 31)) {
748    // 225MiB
749    min_system_memory = 225 * 1024 * 1024;
750  } else {
751    // max(300 MiB, 0.05 * available_memory)
752    min_system_memory =
753        std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
754  }
755#if defined(__GNUC__) && defined(__OPTIMIZE__)
756// Do nothing
757#elif !defined(__GNUC__) && defined(NDEBUG)
758// Do nothing
759#else
760  // Double the reserved system memory in non-opt builds (debug builds
761  // on Windows), because more system memory is necessary in non-opt
762  // builds.
763  min_system_memory *= 2;
764#endif
765#if defined(NVIDIA_TEGRA)
766  // Reserve 1 GiB of system memory on NVIDIA Tegra devices, since they
767  // share the same physical memory for system RAM and video RAM.
768  min_system_memory = 1 << 30;
769#endif
770  return min_system_memory;
771}
772
773// Get the memory limit for the virtual device being created on GPU with
774// 'cuda_gpu_id', when that virtual device is the only virtual device being
775// created on that GPU.
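// For example (illustrative values), per_process_gpu_memory_fraction = 0.5 on
// a GPU with 8 GiB of total memory yields a 4 GiB limit, while a fraction of
// 0 uses all currently free memory minus MinSystemMemory().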
776Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options,
777                                      CudaGpuId cuda_gpu_id,
778                                      int64* memory_limit) {
779  int64 total_memory = 0;
780  int64 available_memory = 0;
781  gpu::StreamExecutor* se =
782      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
783  if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) {
784    return errors::Unknown("Failed to query available memory for GPU ",
785                           cuda_gpu_id.value());
786  }
787
788  int64 allocated_memory = 0;
789  const double per_process_gpu_memory_fraction =
790      gpu_options.per_process_gpu_memory_fraction();
791  if (per_process_gpu_memory_fraction == 0) {
792    allocated_memory = available_memory;
793    const int64 min_system_memory = MinSystemMemory(available_memory);
794    if (min_system_memory < allocated_memory) {
795      allocated_memory -= min_system_memory;
796    }
797  } else {
798    allocated_memory = total_memory * per_process_gpu_memory_fraction;
799  }
800  *memory_limit = allocated_memory;
801  return Status::OK();
802}
803}  // namespace
804
805void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
806                                       PerOpGpuDevice* device, int stream_id,
807                                       Allocator* allocator) {
808  ConcretePerOpGpuDevice* concrete_device =
809      static_cast<ConcretePerOpGpuDevice*>(device);
810  DCHECK(concrete_device);
811  const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>(
812      streams_[stream_id]->compute->implementation()->CudaStreamMemberHack());
813  concrete_device->Reinitialize(context, cuda_stream, tf_gpu_id_, allocator,
814                                scratch_[stream_id]);
815}
816
817PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice() {
818  return new ConcretePerOpGpuDevice();
819}
820
821void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
822                                          PerOpGpuDevice* device,
823                                          DeviceContext* dc,
824                                          Allocator* allocator) {
825  if (dc) {
826    const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc);
827    const int stream_id = gpu_dc->stream_id();
828    VLOG(1) << "  eigen_gpu_device(" << dc << ") => stream[" << stream_id
829            << "]";
830    CHECK_LT(stream_id, streams_.size());
831    ReinitializeDevice(context, device, stream_id, allocator);
832  } else {
833    ReinitializeDevice(context, device, 0, allocator);
834  }
835}
836
837Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
838                                           const string& name_prefix,
839                                           std::vector<Device*>* devices) {
840  TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
841  gpu::Platform* gpu_manager = GPUMachineManager();
842  if (gpu_manager == nullptr) {
843    return Status::OK();
844  }
845  // If there are no GPUs visible, do nothing.
846  if (gpu_manager->VisibleDeviceCount() <= 0) {
847    return Status::OK();
848  }
849
850  size_t num_gpus_to_use = INT_MAX;
851  auto iter = options.config.device_count().find("GPU");
852  if (iter != options.config.device_count().end()) {
853    num_gpus_to_use = iter->second;
854  }
855  const auto& gpu_options = options.config.gpu_options();
856  std::vector<CudaGpuId> visible_gpu_order;
857  TF_RETURN_IF_ERROR(ParseVisibleDeviceList(gpu_options.visible_device_list(),
858                                            &visible_gpu_order));
859
860  std::vector<CudaGpuId> valid_cuda_gpu_ids;
861  TF_RETURN_IF_ERROR(GetValidDeviceIds(visible_gpu_order, &valid_cuda_gpu_ids));
862  if (num_gpus_to_use > valid_cuda_gpu_ids.size()) {
863    num_gpus_to_use = valid_cuda_gpu_ids.size();
864  }
865  if (!valid_cuda_gpu_ids.empty()) {
866    // Save the original device.
867    int original_device = 0;
868    cudaError_t err = cudaGetDevice(&original_device);
869    if (err != cudaSuccess) {
870      return errors::Internal("cudaGetDevice() failed. Status: ",
871                              cudaGetErrorString(err));
872    }
873    // Force the CUDA runtime to initialize implicitly on each valid GPU
874    // before CreateGPUDevice() is called.
875    for (CudaGpuId cuda_gpu_id : valid_cuda_gpu_ids) {
876      err = cudaSetDevice(cuda_gpu_id.value());
877      if (err != cudaSuccess) {
878        return errors::Internal("cudaSetDevice() on GPU:", cuda_gpu_id.value(),
879                                " failed. Status: ", cudaGetErrorString(err));
880      }
881      err = cudaFree(nullptr);
882      if (err != cudaSuccess) {
883        return errors::Internal(
884            "CUDA runtime implicit initialization on GPU:", cuda_gpu_id.value(),
885            " failed. Status: ", cudaGetErrorString(err));
886      }
887    }
888    // Reset to the original device.
889    err = cudaSetDevice(original_device);
890    if (err != cudaSuccess) {
891      return errors::Internal("cudaSetDevice() on GPU:", original_device,
892                              " failed. Status: ", cudaGetErrorString(err));
893    }
894  }
895
896  const auto& virtual_devices = gpu_options.experimental().virtual_devices();
897  if (!virtual_devices.empty()) {
898    TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(
899        num_gpus_to_use, gpu_options, visible_gpu_order, valid_cuda_gpu_ids));
900    // We've verified that num_gpus_to_use >= virtual_devices.size().
901    num_gpus_to_use = virtual_devices.size();
902    CHECK(gpu_options.visible_device_list().empty() ||
903          valid_cuda_gpu_ids == visible_gpu_order);
904  }
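  // As an illustration, a GPUOptions proto such as (values illustrative only)
  //   experimental { virtual_devices { memory_limit_mb: 2048
  //                                    memory_limit_mb: 2048 } }
  // yields two 2 GiB TF GPU devices backed by the first valid physical GPU.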
905  int next_tf_gpu_id = 0;
906  for (int i = 0; i < num_gpus_to_use; ++i) {
907    const CudaGpuId cuda_gpu_id = valid_cuda_gpu_ids[i];
908    std::vector<int64> memory_limit_bytes;
909    if (virtual_devices.empty() ||
910        virtual_devices.Get(i).memory_limit_mb_size() == 0) {
911      int64 single_virtual_device_memory_limit = 0;
912      TF_RETURN_IF_ERROR(SingleVirtualDeviceMemoryLimit(
913          gpu_options, cuda_gpu_id, &single_virtual_device_memory_limit));
914      memory_limit_bytes.push_back(single_virtual_device_memory_limit);
915    } else {
916      const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb();
917      std::transform(memory_limit_mb.begin(), memory_limit_mb.end(),
918                     std::back_inserter(memory_limit_bytes), [](float mb) {
919                       return static_cast<int64>(mb) * (1ll << 20);
920                     });
921    }
922    for (int64 bytes : memory_limit_bytes) {
923      TfGpuId tf_gpu_id(next_tf_gpu_id);
924      ++next_tf_gpu_id;
925      GpuIdUtil::InsertTfCudaGpuIdPair(tf_gpu_id, cuda_gpu_id);
926      TF_RETURN_IF_ERROR(
927          CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes, devices));
928    }
929  }
930  return Status::OK();
931}
932
933static string GetShortDeviceDescription(CudaGpuId cuda_gpu_id,
934                                        const gpu::DeviceDescription& desc) {
935  int cc_major;
936  int cc_minor;
937  if (!desc.cuda_compute_capability(&cc_major, &cc_minor)) {
938    cc_major = 0;
939    cc_minor = 0;
940  }
941  // LINT.IfChange
942  return strings::StrCat("device: ", cuda_gpu_id.value(),
943                         ", name: ", desc.name(),
944                         ", pci bus id: ", desc.pci_bus_id(),
945                         ", compute capability: ", cc_major, ".", cc_minor);
946  // LINT.ThenChange(//tensorflow/python/platform/test.py)
947}
948
949Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
950                                             const string& name_prefix,
951                                             TfGpuId tf_gpu_id,
952                                             int64 memory_limit,
953                                             std::vector<Device*>* devices) {
954  CHECK_GE(tf_gpu_id.value(), 0);
955  const string device_name =
956      strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value());
957
958  // Look up the device, to see its attributes.
959  GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
960  gpu::StreamExecutor* se =
961      GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
962  const gpu::DeviceDescription& desc = se->GetDeviceDescription();
963  int numa_node = desc.numa_node();
964  if (numa_node < 0) {
965    // For some reason the StreamExecutor couldn't get the NUMA
966    // affinity of the GPU.  If this is not a multi-socket mobo with
967    // GPUs local to different buses, it doesn't matter.  If it is, we
968    // may run into trouble later with data transfer operations.  The
969    // trouble may manifest as slower than expected performance, or
970    // outright failures.
971    LOG(INFO) << "Could not identify NUMA node of " << device_name
972              << ", defaulting to 0.  Your kernel may not have been built "
973              << "with NUMA support.";
974    numa_node = 0;
975  }
976  Bytes allocated_bytes = static_cast<Bytes>(memory_limit);
977
978  // Get GPU bus_id from its reported NUMA affinity.  Because GPUs are
979  // virtualized in some environments, we can't just use the GPU id.
980  // NUMA locales are indexed from 0, buses are indexed from 1.
981  DeviceLocality dev_locality;
982  dev_locality.set_bus_id(numa_node + 1);
983  const CudaGpuId cuda_gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id);
984  VLOG(1) << "GPUDevice id " << cuda_gpu_id << " on bus "
985          << dev_locality.bus_id() << " numa: " << numa_node
986          << " pci: " << desc.pci_bus_id();
987
988  LOG(INFO) << "Creating TensorFlow device (" << device_name << " with "
989            << (memory_limit >> 20) << " MB memory) -> physical GPU ("
990            << GetShortDeviceDescription(cuda_gpu_id, desc) << ")";
991  ProcessState* process_state = ProcessState::singleton();
992  BaseGPUDevice* gpu_device = CreateGPUDevice(
993      options, device_name, allocated_bytes, dev_locality, tf_gpu_id,
994      GetShortDeviceDescription(cuda_gpu_id, desc),
995      process_state->GetGPUAllocator(options.config.gpu_options(), tf_gpu_id,
996                                     memory_limit),
997      process_state->GetCPUAllocator(numa_node));
998  TF_RETURN_IF_ERROR(gpu_device->Init(options));
999  devices->push_back(gpu_device);
1000
1001  return Status::OK();
1002}
1003
1004static int GetDefaultMinGPUMultiprocessorCount(
1005    gpu::Platform* gpu_manager,
1006    const std::vector<CudaGpuId>& visible_gpu_order) {
1007  static const int kDefaultMinGPUMultiprocessorCount = 8;
1008
1009  // Find the highest multi-processor count across all visible GPUs.
1010  int max_count = -1;
1011  for (int i = 0; i < visible_gpu_order.size(); ++i) {
1012    auto exec_status =
1013        GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_order[i]);
1014    if (!exec_status.ok()) {
1015      continue;
1016    }
1017
1018    gpu::StreamExecutor* se = exec_status.ValueOrDie();
1019    const gpu::DeviceDescription& desc = se->GetDeviceDescription();
1020    max_count = std::max(max_count, desc.core_count());
1021  }
1022
1023  if (max_count < 0 || kDefaultMinGPUMultiprocessorCount < max_count) {
1024    return kDefaultMinGPUMultiprocessorCount;
1025  } else {
1026    return max_count;
1027  }
1028}
1029
1030static int GetMinGPUMultiprocessorCount(
1031    gpu::Platform* gpu_manager,
1032    const std::vector<CudaGpuId>& visible_gpu_order) {
1033  const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");
1034
1035  if (tf_min_gpu_core_count == nullptr ||
1036      strcmp(tf_min_gpu_core_count, "") == 0) {
1037    return GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
1038  }
1039
1040  int min_gpu_core_count = -1;
1041  if (strings::safe_strto32(tf_min_gpu_core_count, &min_gpu_core_count)) {
1042    if (min_gpu_core_count >= 0) {
1043      return min_gpu_core_count;
1044    }
1045  }
1046
1047  int count =
1048      GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
1049  LOG(ERROR) << "Invalid minimum GPU multiprocessor count: ["
1050             << tf_min_gpu_core_count << "]. "
1051             << "Using the default value: " << count;
1052  return count;
1053}
1054
1055namespace {
1056
1057struct CudaVersion {
1058  // Initialize from version_name in the form of "3.5"
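  // e.g. CudaVersion("3.5") yields major_part == 3 and minor_part == 5.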
1059  explicit CudaVersion(const std::string& version_name) {
1060    size_t dot_pos = version_name.find('.');
1061    CHECK(dot_pos != string::npos)
1062        << "Illegal version name: [" << version_name << "]";
1063    string major_str = version_name.substr(0, dot_pos);
1064    CHECK(strings::safe_strto32(major_str, &major_part))
1065        << "Illegal version name: [" << version_name << "]";
1066    string minor_str = version_name.substr(dot_pos + 1);
1067    CHECK(strings::safe_strto32(minor_str, &minor_part))
1068        << "Illegal version name: [" << version_name << "]";
1069  }
1070  CudaVersion() {}
1071  bool operator<(const CudaVersion& other) const {
1072    if (this->major_part != other.major_part) {
1073      return this->major_part < other.major_part;
1074    }
1075    return this->minor_part < other.minor_part;
1076  }
1077  friend std::ostream& operator<<(std::ostream& os,
1078                                  const CudaVersion& version) {
1079    os << version.major_part << "." << version.minor_part;
1080    return os;
1081  }
1082  int major_part = -1;
1083  int minor_part = -1;
1084};
1085
1086std::vector<CudaVersion> supported_cuda_compute_capabilities = {
1087    TF_CUDA_CAPABILITIES,};
1088
1089std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() {
1090  auto cuda_caps = supported_cuda_compute_capabilities;
1091#ifdef TF_EXTRA_CUDA_CAPABILITIES
1092// TF_EXTRA_CUDA_CAPABILITIES should be defined as a comma-separated sequence,
1093// for example:
1094//   TF_EXTRA_CUDA_CAPABILITIES=3.0,4.0,5.0
1095// Use two-level macro expansion for stringification.
1096#define TF_XSTRING(...) #__VA_ARGS__
1097#define TF_STRING(s) TF_XSTRING(s)
1098  string extra_cuda_caps = TF_STRING(TF_EXTRA_CUDA_CAPABILITIES);
1099#undef TF_STRING
1100#undef TF_XSTRING
1101  auto extra_capabilities = str_util::Split(extra_cuda_caps, ',');
1102  for (const auto& capability : extra_capabilities) {
1103    cuda_caps.push_back(CudaVersion(capability));
1104  }
1105#endif
1106  return cuda_caps;
1107}
1108
1109std::unique_ptr<std::map<std::pair<int, int>, bool>> GetPeerAccessMap(
1110    gpu::Platform* platform, const std::vector<CudaGpuId>& visible_gpu_order) {
1111  std::unique_ptr<std::map<std::pair<int, int>, bool>> map(
1112      new std::map<std::pair<int, int>, bool>);
1113  for (int i = 0; i < visible_gpu_order.size(); ++i) {
1114    const CudaGpuId i_gpu_id = visible_gpu_order[i];
1115    for (int j = 0; j < visible_gpu_order.size(); ++j) {
1116      const CudaGpuId j_gpu_id = visible_gpu_order[j];
1117      gpu::StreamExecutor* from =
1118          GpuIdUtil::ExecutorForCudaGpuId(platform, i_gpu_id).ValueOrDie();
1119      gpu::StreamExecutor* to =
1120          GpuIdUtil::ExecutorForCudaGpuId(platform, j_gpu_id).ValueOrDie();
1121      (*map)[{i, j}] = from->CanEnablePeerAccessTo(to);
1122    }
1123  }
1124
1125  return map;
1126}
1127
1128Status EnablePeerAccess(gpu::Platform* platform,
1129                        const std::vector<CudaGpuId>& visible_gpu_order) {
1130  int possible_peer_count = 0;
1131  int enabled_peer_count = 0;
1132  for (int i = 0; i < visible_gpu_order.size(); ++i) {
1133    const CudaGpuId i_gpu_id = visible_gpu_order[i];
1134    for (int j = 0; j < visible_gpu_order.size(); ++j) {
1135      const CudaGpuId j_gpu_id = visible_gpu_order[j];
1136      // We have already validated that ExecutorForDevice() calls return OK.
1137      gpu::StreamExecutor* from =
1138          GpuIdUtil::ExecutorForCudaGpuId(platform, i_gpu_id).ValueOrDie();
1139      gpu::StreamExecutor* to =
1140          GpuIdUtil::ExecutorForCudaGpuId(platform, j_gpu_id).ValueOrDie();
1141
1142      if (from->CanEnablePeerAccessTo(to)) {
1143        ++possible_peer_count;
1144        auto status = from->EnablePeerAccessTo(to);
1145        if (!status.ok()) {
1146          LOG(WARNING)
1147              << "Unable to enable peer access between device ordinals "
1148              << i_gpu_id << " and " << j_gpu_id << ", status: " << status;
1149        } else {
1150          ++enabled_peer_count;
1151        }
1152      }
1153    }
1154  }
1155
1156  // Return an error in the extreme failure case where the driver
1157  // reported that peering was possible but not a single peering was
1158  // successful.  This is to catch possible system misconfigurations
1159  // or more fundamental issues.
1160  if (possible_peer_count > 0 && enabled_peer_count == 0) {
1161    return errors::Internal(possible_peer_count,
1162                            " potential peer access pairs were reported by the "
1163                            "driver, but no peering could be enabled.");
1164  }
1165  return Status::OK();
1166}
1167
1168}  // namespace
1169
1170Status BaseGPUDeviceFactory::GetValidDeviceIds(
1171    const std::vector<CudaGpuId>& visible_gpu_order,
1172    std::vector<CudaGpuId>* ids) {
1173  gpu::Platform* gpu_manager = GPUMachineManager();
1174  bool new_gpu_found = false;
1175  for (int i = 0; i < visible_gpu_order.size(); ++i) {
1176    const CudaGpuId cuda_gpu_id = visible_gpu_order[i];
1177
1178    // Only perform this once per visible cuda gpu id.
1179    if (visible_gpu_initialized_[cuda_gpu_id.value()]) {
1180      continue;
1181    }
1182
1183    visible_gpu_initialized_[cuda_gpu_id.value()] = true;
1184    new_gpu_found = true;
1185
1186    auto executor = GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, cuda_gpu_id);
1187    if (!executor.ok()) {
1188      return StreamExecutorUtil::ConvertStatus(executor.status());
1189    }
1190
1191    auto stream_exec = executor.ValueOrDie();
1192    int64 free_bytes;
1193    int64 total_bytes;
1194    if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) {
1195      // Logs internally on failure.
1196      free_bytes = 0;
1197      total_bytes = 0;
1198    }
1199    const auto& description = stream_exec->GetDeviceDescription();
1200    int cc_major;
1201    int cc_minor;
1202    if (!description.cuda_compute_capability(&cc_major, &cc_minor)) {
1203      // Logs internally on failure.
1204      cc_major = 0;
1205      cc_minor = 0;
1206    }
1207    LOG(INFO) << "Found device " << i << " with properties: "
1208              << "\nname: " << description.name() << " major: " << cc_major
1209              << " minor: " << cc_minor
1210              << " memoryClockRate(GHz): " << description.clock_rate_ghz()
1211              << "\npciBusID: " << description.pci_bus_id() << "\ntotalMemory: "
1212              << strings::HumanReadableNumBytes(total_bytes)
1213              << " freeMemory: " << strings::HumanReadableNumBytes(free_bytes);
1214  }
1215  // Check peer access and show the matrix if more than one GPU was found.
1216  if (new_gpu_found && visible_gpu_order.size() > 1) {
1217    // Enable peer access
1218    TF_RETURN_IF_ERROR(EnablePeerAccess(gpu_manager, visible_gpu_order));
1219
1220    // Print out a matrix showing which devices can DMA to one
1221    // another.
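    // The logged matrix looks like the following (illustrative, for two
    // mutually peered GPUs):
    //   DMA: 0 1
    //   0:   Y Y
    //   1:   Y Y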
1222    LOG(INFO) << "Device peer to peer matrix";
1223    auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
1224    string line_buf = "DMA: ";
1225    for (int i = 0; i < visible_gpu_order.size(); ++i) {
1226      strings::StrAppend(&line_buf, visible_gpu_order[i].value(), " ");
1227    }
1228    LOG(INFO) << line_buf;
1229    for (int i = 0; i < visible_gpu_order.size(); ++i) {
1230      line_buf = strings::StrCat(visible_gpu_order[i].value(), ":   ");
1231      for (int j = 0; j < visible_gpu_order.size(); ++j) {
1232        if ((*access_map)[{i, j}]) {
1233          line_buf.append("Y ");
1234        } else {
1235          line_buf.append("N ");
1236        }
1237      }
1238      LOG(INFO) << line_buf;
1239    }
1240  }
1241
1242  auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities();
1243  if (cuda_supported_capabilities.empty()) {
1244    return errors::FailedPrecondition(
1245        "No supported cuda capabilities in binary.");
1246  }
1247  CudaVersion min_supported_capability = *std::min_element(
1248      cuda_supported_capabilities.begin(), cuda_supported_capabilities.end());
1249
1250  int min_gpu_core_count =
1251      GetMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
1252
1253  // Filter out devices that don't have the right capability or power.
1254  for (int i = 0; i < visible_gpu_order.size(); ++i) {
1255    const CudaGpuId visible_gpu_id = visible_gpu_order[i];
1256    auto exec_status =
1257        GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_id);
1258    if (!exec_status.ok()) {
1259      LOG(INFO) << "Ignoring visible gpu device " << visible_gpu_id
1260                << " whose executor is in invalid state: "
1261                << exec_status.status().ToString();
1262      continue;
1263    }
1264    gpu::StreamExecutor* se = exec_status.ValueOrDie();
1265    const gpu::DeviceDescription& desc = se->GetDeviceDescription();
1266    CudaVersion device_capability;
1267    if (!desc.cuda_compute_capability(&device_capability.major_part,
1268                                      &device_capability.minor_part)) {
1269      LOG(INFO) << "Ignoring visible gpu device "
1270                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
1271                << ") "
1272                << "whose CUDA compute capability is not available.";
1273      continue;
1274    }
1275    // Only GPUs with at least the minimum supported compute capability are
1276    // accepted.
1277    if (device_capability < min_supported_capability) {
1278      LOG(INFO) << "Ignoring visible gpu device "
1279                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
1280                << ") "
1281                << "with Cuda compute capability " << device_capability
1282                << ". The minimum required Cuda capability is "
1283                << min_supported_capability << ".";
1284      continue;
1285    }
1286
1287    // Filter out slow GPUs. By default, GPUs with a lower multiprocessor
1288    // count than the fastest GPU are filtered out, unless they have 8 or more
1289    // multiprocessors. If the TF_MIN_GPU_MULTIPROCESSOR_COUNT environment
1290    // variable is set, its value will be used to filter out GPUs.
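    // For example, setting TF_MIN_GPU_MULTIPROCESSOR_COUNT=4 (hypothetical
    // value) would also let GPUs with as few as 4 multiprocessors pass this
    // filter.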
1291    if (desc.core_count() < min_gpu_core_count) {
1292      LOG(INFO) << "Ignoring visible gpu device "
1293                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
1294                << ") "
1295                << "with Cuda multiprocessor count: " << desc.core_count()
1296                << ". The minimum required count is " << min_gpu_core_count
1297                << ". You can adjust this requirement with the env var "
1298                   "TF_MIN_GPU_MULTIPROCESSOR_COUNT.";
1299      continue;
1300    }
1301    ids->push_back(visible_gpu_id);
1302  }
1303  if (!ids->empty()) {
1304    std::vector<int> raw_ids(ids->size());
1305    std::transform(ids->begin(), ids->end(), raw_ids.begin(),
1306                   [](CudaGpuId id) -> int { return id.value(); });
1307    LOG(INFO) << "Adding visible gpu devices: "
1308              << str_util::Join(raw_ids, ", ");
1309  }
1310
1311  return Status::OK();
1312}
1313
1314}  // namespace tensorflow
1315
1316#endif  // GOOGLE_CUDA
1317