/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"

#include <set>
#include <utility>
#include <vector>

#include "tensorflow/compiler/xla/map_util.h"
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/logical_buffer.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {
namespace {

// A helper class for profiling HLO in the course of GPU program execution.
// All of the profiling is guarded internally, to avoid the caller needing to
// have lots of conditionals sprinkled around.
class HloExecutionProfiler {
 public:
  // If profiling is enabled, start an execution timer running.
  explicit HloExecutionProfiler(bool do_profile, HloExecutionProfile* profile,
                                se::Stream* stream,
                                const HloComputation* computation)
      : do_profile_(do_profile),
        profile_(profile),
        stream_(stream),
        computation_(computation) {
    if (do_profile_) {
      clock_rate_ghz_ =
          stream->parent()->GetDeviceDescription().clock_rate_ghz();
      execution_timer_.reset(new se::Timer(stream->parent()));
      per_op_timer_.reset(new se::Timer(stream->parent()));
      stream->InitTimer(execution_timer_.get())
          .ThenStartTimer(execution_timer_.get());
      stream->InitTimer(per_op_timer_.get());
    }
  }

  // If profiling is enabled, sets the total cycle count on the profile from
  // the execution timer.
  void FinishExecution() {
    CHECK(!finished_execution_) << "Call FinishExecution only once!";
    finished_execution_ = true;
    if (do_profile_) {
      stream_->ThenStopTimer(execution_timer_.get());
      stream_->BlockHostUntilDone().IgnoreError();
      profile_->set_total_cycles_executed(
          *computation_, execution_timer_->Nanoseconds() * clock_rate_ghz_);
    }
  }

  // If profiling is enabled, starts the per-operation timer.
  void StartOperation() {
    if (do_profile_) {
      stream_->ThenStartTimer(per_op_timer_.get());
    }
  }

  // If profiling is enabled, stops the per-operation timer and records the
  // time that the hlo_instruction took to execute in the profile.
  void FinishOperation(const HloInstruction* hlo_instruction) {
    if (do_profile_) {
      stream_->ThenStopTimer(per_op_timer_.get());
      stream_->BlockHostUntilDone().IgnoreError();
      profile_->SetCyclesTakenBy(
          hlo_instruction, per_op_timer_->Nanoseconds() * clock_rate_ghz_);
    }
  }

 private:
  const bool do_profile_;
  double clock_rate_ghz_;
  HloExecutionProfile* profile_;
  se::Stream* stream_;
  const HloComputation* computation_;
  std::unique_ptr<se::Timer> execution_timer_;
  std::unique_ptr<se::Timer> per_op_timer_;
  bool finished_execution_ = false;
};

}  // namespace

// Implementation note: HLO profiling is always enabled for GPU executables,
// since we can use timers around thunks.
GpuExecutable::GpuExecutable(
    const string& ptx, const std::vector<uint8>& cubin,
    std::pair<int, int> compute_capability,
    std::unique_ptr<const ThunkSchedule> thunk_schedule,
    std::unique_ptr<const HloModule> hlo_module,
    std::unique_ptr<const BufferAssignment> assignment,
    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                 std::move(hlo_profile_index_map)),
      ptx_(ptx),
      cubin_(cubin),
      compute_capability_(compute_capability),
      thunk_schedule_(std::move(thunk_schedule)),
      assignment_(std::move(assignment)) {}

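// Runs the thunks in thunk_schedule_ in the schedule's total order, launching
// each one on the stream the schedule assigned to it and adding the
// inter-stream waits the schedule requires.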
Status GpuExecutable::ExecuteThunks(
    const ServiceExecutableRunOptions* run_options,
    const BufferAllocations& buffer_allocations, bool block_host_until_done,
    HloExecutionProfile* hlo_execution_profile) {
  se::Stream* main_stream = run_options->stream();

  std::pair<int, int> stream_compute_capability;
  main_stream->parent()->GetDeviceDescription().cuda_compute_capability(
      &stream_compute_capability.first,
      &stream_compute_capability.second);
  TF_RET_CHECK(stream_compute_capability == compute_capability_)
      << "Compute capability mismatch; expected {" << compute_capability_.first
      << ", " << compute_capability_.second << "}, but was {"
      << stream_compute_capability.first << ", "
      << stream_compute_capability.second << "}";

  bool do_profile = hlo_execution_profile != nullptr;
  if (do_profile) {
    LOG(WARNING) << "PROFILING: profiling is enabled";
  }

  HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
                                hlo_module_->entry_computation());

  uint64 start_micros = tensorflow::Env::Default()->NowMicros();

  // Stream 0 indicates `main_stream` and substreams start from stream 1.
  std::vector<Pool<se::Stream>::SmartPtr> sub_streams;
  while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) {
    sub_streams.emplace_back();
    TF_ASSIGN_OR_RETURN(
        sub_streams.back(),
        run_options->BorrowStream(main_stream->parent()->device_ordinal()));
  }

  // The next event enqueued on stream N must not run until the thunk at
  // last_blocking_thunk_for_stream[N] completes.
  std::map<int32, const Thunk*> last_blocking_thunk_for_stream;
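  // When a thunk that other thunks depend on finishes, an event is recorded
  // on its stream; dependent thunks make their streams wait on that event.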
  std::map<const Thunk*, std::unique_ptr<se::Event>> thunk_to_finish_event;
  for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
    TF_RETURN_IF_ERROR(thunk->Initialize(*this));
    int32 stream_no =
        thunk_schedule_->StreamNumberForHlo(*thunk->hlo_instruction());
    se::Stream* stream =
        (stream_no == 0 ? main_stream : sub_streams[stream_no - 1].get());

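    // Make this thunk's stream wait for the finish events of every thunk it
    // depends on.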
    for (const Thunk* dependency : thunk_schedule_->DependsOn(thunk)) {
      stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get());
    }

    if (last_blocking_thunk_for_stream.count(stream_no)) {
      stream->ThenWaitFor(FindOrDie(thunk_to_finish_event,
                                    last_blocking_thunk_for_stream[stream_no])
                              .get());
      last_blocking_thunk_for_stream.erase(stream_no);
    }

    // If this thunk requests it, wait for all currently-executing thunks to
    // finish.  This is useful e.g. if the thunk is about to perform
    // autotuning.
    if (thunk->ShouldHaltAllActivityBeforeRunning(stream)) {
      TF_RETURN_IF_ERROR(main_stream->BlockHostUntilDone());
      last_blocking_thunk_for_stream.clear();
    }

    profiler.StartOperation();
    VLOG(2) << "Executing the thunk for "
            << thunk->hlo_instruction()->ToString() << " on stream "
            << stream_no;
    TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream));
    if (thunk_schedule_->Depended(thunk) || thunk->ShouldBlockFutureThunks()) {
      auto finish_event = MakeUnique<se::Event>(main_stream->parent());
      finish_event->Init();
      stream->ThenRecordEvent(finish_event.get());
      thunk_to_finish_event[thunk] = std::move(finish_event);

      if (thunk->ShouldBlockFutureThunks()) {
        // Set last_blocking_thunk_for_stream on all streams other than this
        // one so that all other streams will wait for this thunk to complete
        // before executing any events that occur later in the total order.
        for (int32 i = 0; i < sub_streams.size() + 1; ++i) {
          if (i != stream_no) {
            last_blocking_thunk_for_stream[i] = thunk;
          }
        }
      }
    }
    profiler.FinishOperation(thunk->hlo_instruction());
  }

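  // Have the main stream wait until all work queued on the sub-streams has
  // finished.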
  main_stream->ThenWaitFor(&sub_streams);
  // Make sure kernels are completed before deallocating temporary buffers.
  // TODO(b/30100571): we could potentially postpone deallocating the temp
  // buffers until a different computation is executed.
  if (block_host_until_done) {
    Status block_status = main_stream->BlockHostUntilDone();
    if (!block_status.ok()) {
      return InternalError(
          "Failed to complete all kernels launched on stream %p: %s",
          main_stream, block_status.error_message().c_str());
    }
  }

  profiler.FinishExecution();
  uint64 end_micros = tensorflow::Env::Default()->NowMicros();

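  // Record the wall-clock compute time (and, if HLO profiling ran, the total
  // cycle count) in the executable's execution profile.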
  {
    tensorflow::mutex_lock lock(mutex_);
    const double nanoseconds = (end_micros - start_micros) * 1000.0;
    execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));

    // If hlo profiling was disabled then the cycle count is left empty.
    if (do_profile) {
      execution_profile_.set_compute_cycle_count(
          hlo_execution_profile->total_cycles_executed(
              *module().entry_computation()));
    }
  }

  return Status::OK();
}

StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
    const ServiceExecutableRunOptions* run_options,
    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
    HloExecutionProfile* hlo_execution_profile) {
  DeviceMemoryAllocator* memory_allocator = run_options->allocator();

  if (GetRootPointsToSet().IsAmbiguous()) {
    return Unimplemented("Points-to set of root instruction is ambiguous");
  }

  BufferAllocations::Builder buffer_allocations_builder;
  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
       ++i) {
    const BufferAllocation& allocation = assignment_->GetAllocation(i);
    if (allocation.is_entry_computation_parameter()) {
      // The caller must give us a buffer for ShapeIndex {} of every parameter.
      // It can optionally give us a buffer for other ShapeIndices, but we
      // ignore them: Because we can't rely on these sub-buffers' addresses
      // being available, our generated code can't use them.  Instead, it must
      // chase pointers starting at the tuple root.
      if (allocation.param_shape_index().empty()) {
        auto param_no = allocation.parameter_number();
        buffer_allocations_builder.RegisterBuffer(
            i, arguments[param_no]->root_buffer());
      }
    }
  }
  se::StreamExecutor* executor = run_options->stream()->parent();
  TF_ASSIGN_OR_RETURN(
      auto buffer_allocations,
      buffer_allocations_builder.Build(*assignment_, executor->device_ordinal(),
                                       memory_allocator));

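  // If the allocator cannot free device memory asynchronously, the host must
  // block until all launched kernels are done before buffers are deallocated.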
  bool block_host_until_done =
      !memory_allocator->AllowsAsynchronousDeallocation();
  TF_RETURN_IF_ERROR(ExecuteThunks(run_options, *buffer_allocations,
                                   block_host_until_done,
                                   hlo_execution_profile));

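  // Allocate a ShapedBuffer to hold the result; the device address of each of
  // its elements is filled in below.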
  HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
  auto device_ordinal = executor->device_ordinal();
  auto shaped_buffer = MakeUnique<ShapedBuffer>(
      root->shape(), root->shape(), executor->platform(), device_ordinal);

  // Copy DeviceMemoryBase values which contain the array(s) of the result into
  // the respective location in ShapedBuffer.
  std::set<se::DeviceMemoryBase> buffers_in_result;
  TF_RETURN_IF_ERROR(shaped_buffer->buffers().ForEachMutableElementWithStatus(
      [&buffer_allocations, &buffers_in_result, &shaped_buffer, this](
          const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
        const auto& sources = this->GetRootPointsToSet().element(index);
        // The points-to set is unambiguous so the set should be a
        // singleton. That is, we know exactly which instruction
        // produced the array at this element.
        CHECK_EQ(1, sources.size());
        auto src_hlo = sources[0]->instruction();

        VLOG(4) << "Looking at: " << sources[0];

        // The source instruction should have a non-parameter buffer
        // assigned.
        TF_ASSIGN_OR_RETURN(
            const BufferAllocation::Slice slice,
            this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index()));
        CHECK(!slice.allocation()->is_entry_computation_parameter());

        se::DeviceMemoryBase src_base =
            buffer_allocations->GetDeviceAddress(slice.index());
        CHECK(!src_base.is_null() || src_base.size() == 0);
        *device_memory = src_base;
        buffers_in_result.insert(src_base);
        return Status::OK();
      }));
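  // Free every temporary allocation except the buffers that live on as part
  // of the result.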
  TF_RETURN_IF_ERROR(
      buffer_allocations->TearDown(buffers_in_result, *assignment_));

  return std::move(shaped_buffer);
}

StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteAsyncOnStream(
    const ServiceExecutableRunOptions* run_options,
    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
  // TODO(b/30671675): Implement asynchronous execution mode.
  return Unimplemented(
      "Asynchronous execution on stream is not yet supported on GPU.");
}

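// Returns the points-to set of the root instruction of the entry computation.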
const PointsToSet& GpuExecutable::GetRootPointsToSet() const {
  return assignment_->points_to_analysis().GetPointsToSet(
      module().entry_computation()->root_instruction());
}

}  // namespace gpu
}  // namespace xla