/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h"

#include <limits>
#include <string>
#include <utility>
#include <vector>

#include "llvm/IR/DataLayout.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_compiler.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {

// TODO(b/30467474) Once GPU infeed implementation settles, consider
// folding back the cpu and gpu infeed implementations into a generic
// one if possible.
GpuTransferManager::GpuTransferManager()
    : GenericTransferManager(
          se::cuda::kCudaPlatformId,
          /*pointer_size=*/llvm::DataLayout(gpu::GpuCompiler::kDataLayout)
              .getPointerSize(0 /* default address space */)) {}

Status GpuTransferManager::TransferLiteralToInfeed(se::StreamExecutor* executor,
                                                   const Literal& literal) {
  const Shape& shape = literal.shape();
  VLOG(2) << "Transferring literal to infeed with shape: "
          << ShapeUtil::HumanString(shape);

  if (!ShapeUtil::IsTuple(shape)) {
    int64 size = GetByteSizeRequirement(shape);
    return TransferBufferToInfeed(executor, size, literal.untyped_data());
  }

  if (ShapeUtil::IsNestedTuple(shape)) {
    return Unimplemented(
        "Infeed with a nested tuple shape is not supported: %s",
        ShapeUtil::HumanString(literal.shape()).c_str());
  }

  // For a tuple, we transfer each of its elements to the device and
  // enqueue the resulting destination device addresses with the
  // infeed manager.
  std::vector<gpu::InfeedBuffer*> buffers;
  buffers.reserve(ShapeUtil::TupleElementCount(shape));
  // Capture the buffer vector by reference so that, if any element transfer
  // below fails, the buffers allocated so far are released.
  auto cleanup = tensorflow::gtl::MakeCleanup([&buffers]() {
    for (gpu::InfeedBuffer* b : buffers) {
      b->Done();
    }
  });

  for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
    const Shape& tuple_element_shape =
        ShapeUtil::GetTupleElementShape(shape, i);
    int64 tuple_element_size = GetByteSizeRequirement(tuple_element_shape);
    TF_ASSIGN_OR_RETURN(
        gpu::InfeedBuffer * buffer,
        TransferBufferToInfeedInternal(executor, tuple_element_size,
                                       literal.untyped_data({i})));
    buffers.push_back(buffer);
  }

  // All transfers succeeded; hand the buffers to the infeed manager instead
  // of running the cleanup.
  cleanup.release();
  return EnqueueBuffersToInfeed(executor, buffers);
}

Status GpuTransferManager::TransferBufferToInfeed(se::StreamExecutor* executor,
                                                  int64 size,
                                                  const void* source) {
  TF_ASSIGN_OR_RETURN(gpu::InfeedBuffer * buffer,
                      TransferBufferToInfeedInternal(executor, size, source));
  return EnqueueBuffersToInfeed(executor, {buffer});
}

Status GpuTransferManager::EnqueueBuffersToInfeed(
    se::StreamExecutor* executor, std::vector<gpu::InfeedBuffer*> buffers) {
  gpu::InfeedManager* infeed_manager = gpu::GetOrCreateInfeedManager();
  se::Stream* stream = infeed_manager->GetStream(executor);

  // TODO(b/30467474): Since this stream is shared across different
  // infeed requests, blocking on the stream might be
  // heavy-handed. Figure out if finer-grained acknowledgement is
  // possible.
  Status block_status = stream->BlockHostUntilDone();
  if (!block_status.ok()) {
    for (gpu::InfeedBuffer* b : buffers) {
      b->Done();
    }
    return InternalError("Failed to complete data transfer on stream %p: %s",
                         stream, block_status.error_message().c_str());
  }

  infeed_manager->EnqueueBuffers(buffers);

  VLOG(2) << "Infeed data transferred";

  return Status::OK();
}

StatusOr<gpu::InfeedBuffer*> GpuTransferManager::TransferBufferToInfeedInternal(
    se::StreamExecutor* executor, int64 size, const void* source) {
  if (size > std::numeric_limits<int32>::max()) {
    return InvalidArgument("Infeed shape is too large: needs %lld bytes", size);
  }

  if (size == 0) {
    return InvalidArgument("Infeed shape needs 0 bytes");
  }

  gpu::InfeedManager* infeed_manager = gpu::GetOrCreateInfeedManager();
  se::Stream* stream = infeed_manager->GetStream(executor);
  if (stream == nullptr) {
    return InternalError("Failed to obtain a stream");
  }

  gpu::InfeedBuffer* buffer = new gpu::InfeedBuffer(executor, size);
  stream->ThenMemcpy(buffer->device_memory(), source, size);

  VLOG(2) << "Queued infeed data on stream " << stream;

  return buffer;
}

}  // namespace xla

static std::unique_ptr<xla::TransferManager> CreateGpuTransferManager() {
  return xla::MakeUnique<xla::GpuTransferManager>();
}

static bool InitModule() {
  xla::TransferManager::RegisterTransferManager(se::cuda::kCudaPlatformId,
                                                &CreateGpuTransferManager);
  return true;
}
static bool module_initialized = InitModule();
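
// A minimal usage sketch, not part of the original file: once the module
// initializer above has run, callers are assumed to look up this transfer
// manager through TransferManager::GetForPlatform() (declared in
// tensorflow/compiler/xla/service/transfer_manager.h), with PlatformUtil
// supplying the platform handle. Error handling via ValueOrDie() is for
// illustration only; production code should propagate the StatusOr errors.
//
//   #include "tensorflow/compiler/xla/service/platform_util.h"
//   #include "tensorflow/compiler/xla/service/transfer_manager.h"
//
//   se::Platform* platform =
//       xla::PlatformUtil::GetDefaultPlatform().ValueOrDie();
//   xla::TransferManager* transfer_manager =
//       xla::TransferManager::GetForPlatform(platform).ValueOrDie();
//   // When the default platform is CUDA, transfer_manager points at the
//   // GpuTransferManager registered by InitModule() above.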