/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
#include "tensorflow/compiler/xla/service/gpu/thunk.h"
#include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace xla {
namespace gpu {

// GPU-targeting implementation of the XLA Executable interface.
44// 45// Launches the given CUDA kernel via the StreamExecutor. 46// 47// This is an immutable data type after initialization, and thus thread safe. 48class GpuExecutable : public Executable { 49 public: 50 // cubin (i.e. the compiled ptx) may be empty, in which case we leave 51 // compilation up to the GPU driver. 52 GpuExecutable(const string& ptx, const std::vector<uint8>& cubin, 53 std::pair<int, int> compute_capability, 54 std::unique_ptr<const ThunkSchedule> thunk_schedule, 55 std::unique_ptr<const HloModule> hlo_module, 56 std::unique_ptr<const BufferAssignment> assignment, 57 std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data, 58 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map); 59 60 // This should be called after set_ir_module_string. 61 const string& ir_module_string() const { return ir_module_string_; } 62 63 // This should be called before ExecuteOnStream. 64 void set_ir_module_string(const string& ir_module_string) { 65 ir_module_string_ = ir_module_string; 66 } 67 68 // Returns the compiled PTX for the computation. 69 tensorflow::StringPiece ptx() const { return ptx_; } 70 71 // Returns the cubin (compiled PTX) stored in this GpuExecutable. May be 72 // empty, in which case compilation is left up to the GPU driver. 73 const std::vector<uint8>& cubin() const { return cubin_; } 74 75 // ExecuteOnStream will fail if the compute capability of the stream doesn't 76 // match the compute capability passed to this object's constructor. 
77 StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream( 78 const ServiceExecutableRunOptions* run_options, 79 tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments, 80 HloExecutionProfile* hlo_execution_profile) override; 81 82 StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream( 83 const ServiceExecutableRunOptions* run_options, 84 tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override; 85 86 const Status EqualOrFail(const Executable& executable) { 87 // TODO(b/62952745) Implement equality test on GPU executable. 88 return Unimplemented("Equality test on GPU executable is not implemented."); 89 } 90 91 private: 92 // If `block_host_until_done` is false, execution will not block the host 93 // until the kernels have completed. This is used as an optimization for 94 // clients, such as Tensorflow, that use a single stream of execution for 95 // computations, and allow host-side deallocation from the allocator before 96 // GPU execution completes. 97 Status ExecuteThunks(const ServiceExecutableRunOptions* run_options, 98 const BufferAllocations& buffer_allocations, 99 bool block_host_until_done, 100 HloExecutionProfile* hlo_execution_profile); 101 102 // Returns the points-to set of the root instruction of the entry 103 // computation. Uses points-to analysis from buffer assignment. 104 const PointsToSet& GetRootPointsToSet() const; 105 106 // The LLVM IR, in string format, of the unoptimized module generated for this 107 // GpuExecutable. We save a string instead of an llvm::Module* because leaving 108 // llvm::Module* in a singleton can cause the heap checker to emit false 109 // positives. 110 // 111 // This string should be modified only before ExecuteOnStream. 112 string ir_module_string_; 113 114 // The PTX for the computation. 115 const string ptx_; 116 117 // The GPU machine code for the computation, targeting GPUs at 118 // compute_capability_. 
119 // 120 // May be empty, in which case we leave compilation up to the GPU driver. 121 const std::vector<uint8> cubin_; 122 123 // The compute capability of the GPU we're targeting with this GpuExecutable. 124 std::pair<int, int> compute_capability_; 125 126 // The thunks to be invoked by this GpuExecutable. They are generated by the 127 // IrEmitter. 128 const std::unique_ptr<const ThunkSchedule> thunk_schedule_; 129 130 // Owns the buffer data at runtime. It provides information to allocate 131 // memory for every output/temp buffers. 132 const std::unique_ptr<const BufferAssignment> assignment_; 133 134 TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable); 135}; 136 137} // namespace gpu 138} // namespace xla 139 140#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_ 141