/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h"
#include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/core/lib/gtl/optional.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/strcat.h"

namespace xla {
namespace gpu {
namespace {

namespace se = perftools::gputools;

using se::DeviceMemoryBase;
using se::dnn::AlgorithmConfig;
using se::dnn::AlgorithmDesc;
using tensorflow::gtl::nullopt;
using tensorflow::gtl::optional;

class ScratchAllocator : public se::ScratchAllocator {
 public:
  ScratchAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator)
      : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}

  ~ScratchAllocator() override;

  int64 GetMemoryLimitInBytes(se::Stream* stream) override {
    return 1LL << 32;  // 4GB.  TODO(jlebar): Tune this?
  }
  int64 TotalAllocatedBytes() { return total_allocated_bytes_; }

  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
      se::Stream* stream, int64 byte_size) override;

 private:
  const int device_ordinal_;
  DeviceMemoryAllocator* memory_allocator_;
  std::vector<se::DeviceMemoryBase> allocated_buffers_;
  int64 total_allocated_bytes_ = 0;
};

ScratchAllocator::~ScratchAllocator() {
  for (auto& allocated_buffer : allocated_buffers_) {
    if (!memory_allocator_->Deallocate(device_ordinal_, &allocated_buffer)
             .ok()) {
      // The program can still continue with failed deallocation.
      LOG(ERROR) << "Failed to deallocate the allocated buffer: "
                 << allocated_buffer.opaque();
    }
  }
}

se::port::StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
    se::Stream* stream, int64 byte_size) {
  CHECK_GE(byte_size, 0) << "byte_size must be nonnegative.";
  if (byte_size > GetMemoryLimitInBytes(stream)) {
    return se::port::Status(
        se::port::error::RESOURCE_EXHAUSTED,
        tensorflow::strings::Printf(
            "Allocating %lld bytes exceeds the memory limit of %lld bytes.",
            byte_size, GetMemoryLimitInBytes(stream)));
  }

  auto status_or_memory =
      memory_allocator_->Allocate(device_ordinal_, byte_size,
                                  /*retry_on_failure=*/false);
  if (!status_or_memory.ok()) {
    return se::port::Status(se::port::error::RESOURCE_EXHAUSTED,
                            tensorflow::strings::Printf(
                                "Failed to allocate %lld bytes on device %d.",
                                byte_size, device_ordinal_));
  }
  se::DeviceMemoryBase allocated_buffer = status_or_memory.ValueOrDie();
  allocated_buffers_.push_back(allocated_buffer);
  total_allocated_bytes_ += byte_size;
  return se::DeviceMemory<uint8>(allocated_buffer);
}

// Determines whether we can safely perform a winograd non-fused convolution for
// the given input and output shapes.  This works around b/68264959, an integer
// overflow in cuDNNv5 and cuDNNv6.
//
// TODO(jlebar): We shouldn't need this check for cuDNNv7.
bool ShouldIncludeWinogradNonfusedAlgo(
    const Shape& input_shape, const Shape& output_shape,
    const ConvolutionDimensionNumbers& dnums) {
  int64 batch = input_shape.dimensions(dnums.input_batch_dimension());
  int64 in_depths = input_shape.dimensions(dnums.input_feature_dimension());
  int64 in_rows = input_shape.dimensions(dnums.input_spatial_dimensions(0));
  int64 in_cols =
      dnums.input_spatial_dimensions_size() == 1
          ? 1
          : input_shape.dimensions(dnums.input_spatial_dimensions(1));
  int64 out_depths = output_shape.dimensions(dnums.output_feature_dimension());

  int64 total_size = CeilOfRatio(batch, int64{16}) *
                     std::max(in_depths, out_depths) * in_cols * in_rows *
                     sizeof(float);

  const int64 threshold = int64{1} << 31;  // 2GB; stay below the int32 overflow point.
  return total_size < threshold;
}

std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
                                         bool with_winograd_nonfused,
                                         se::StreamExecutor* stream_exec_) {
  std::vector<AlgorithmDesc> algorithms;
  switch (kind) {
    case CudnnConvKind::kBackwardFilter:
      CHECK(stream_exec_->GetConvolveBackwardFilterAlgorithms(
          with_winograd_nonfused, &algorithms));
      break;
    case CudnnConvKind::kBackwardInput:
      CHECK(stream_exec_->GetConvolveBackwardDataAlgorithms(
          with_winograd_nonfused, &algorithms));
      break;
    case CudnnConvKind::kForward:
      CHECK(stream_exec_->GetConvolveAlgorithms(with_winograd_nonfused,
                                                &algorithms));
      break;
  }

  return algorithms;
}

string AlgorithmToString(const AlgorithmDesc& algo) {
  if (algo.tensor_ops_enabled()) {
    return tensorflow::strings::StrCat(algo.algo_id(), "+TC");
  }
  return tensorflow::strings::StrCat(algo.algo_id());
}

string NumBytesToString(int64 bytes) {
  return tensorflow::strings::StrCat(
      tensorflow::strings::HumanReadableNumBytes(bytes), " (", bytes, "B)");
}

}  // anonymous namespace

// We could have caching here so that we don't redo this work for two identical
// convolutions.  Unfortunately our cache key would have to be a tuple
// containing the protos passed to this function, and we have no utility for
// hashing protos.  We could write our own hash functions, but they'd silently
// break if we ever added a field to one of the protos.  Perhaps we could hack
// using the binary-encoded proto as the hash key, on the assumption that two
// protos being binary-equal is a sufficient, if not necessary, condition for
// proper equality.  But that would still leave us open to having unnecessary
// cache misses and doing extra work.  Overall, caching doesn't seem worth the
// trouble, but we may want to revisit this if we ever find a model where
// caching would speed up compilation a lot.
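//
// For illustration only, a minimal sketch of that binary-encoding hack,
// assuming a hypothetical MakeConvCacheKey helper that is not used anywhere
// in this pass.  Binary equality of serialized protos implies semantic
// equality but not vice versa, so such a key could only cause spurious
// misses, never wrong hits:
//
//   string MakeConvCacheKey(const Window& window,
//                           const ConvolutionDimensionNumbers& dnums,
//                           const Shape& input_shape,
//                           const Shape& filter_shape,
//                           const Shape& output_shape) {
//     return tensorflow::strings::StrCat(
//         window.SerializeAsString(), "|", dnums.SerializeAsString(), "|",
//         ShapeUtil::HumanStringWithLayout(input_shape), "|",
//         ShapeUtil::HumanStringWithLayout(filter_shape), "|",
//         ShapeUtil::HumanStringWithLayout(output_shape));
//   }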
optional<std::tuple<int64, bool, int64>>
CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
    CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
    const Shape& output_shape, const Window& window,
    const ConvolutionDimensionNumbers& dnums, HloInstruction* instr) {
  // Create a stream for us to do our work on.
  se::Stream stream{stream_exec_};
  stream.Init();
  const auto device_ordinal = stream_exec_->device_ordinal();

  // allocator either points to this->allocator_ or, if that's null, to a
  // StreamExecutorMemoryAllocator for stream_exec_.
  DeviceMemoryAllocator* allocator;
  optional<StreamExecutorMemoryAllocator> se_allocator;
  if (allocator_ != nullptr) {
    allocator = allocator_;
  } else {
    se_allocator.emplace(
        stream_exec_->platform(),
        tensorflow::gtl::ArraySlice<se::StreamExecutor*>({stream_exec_}));
    allocator = &*se_allocator;
  }

  // Allocate space for the input, filter, and output of the convolution.  We
  // use a ScratchAllocator for this instead of calling allocator_ directly so
  // that our allocations don't leak.
  //
  // We don't put any data in these buffers, because (in theory, anyway) the
  // speed of a conv isn't affected by the data being convolved.
  ScratchAllocator input_output_allocator(device_ordinal, allocator);
  se::port::StatusOr<DeviceMemoryBase> input_buf =
      input_output_allocator.AllocateBytes(&stream,
                                           ShapeUtil::ByteSizeOf(input_shape));
  se::port::StatusOr<DeviceMemoryBase> filter_buf =
      input_output_allocator.AllocateBytes(&stream,
                                           ShapeUtil::ByteSizeOf(filter_shape));
  se::port::StatusOr<DeviceMemoryBase> output_buf =
      input_output_allocator.AllocateBytes(&stream,
                                           ShapeUtil::ByteSizeOf(output_shape));
  if (!input_buf.ok() || !filter_buf.ok() || !output_buf.ok()) {
    LOG(WARNING)
        << "Couldn't allocate space for input/filter/output of convolution "
        << instr->ToString() << ".  Falling back to default algorithm.";
    return nullopt;
  }

  const bool use_winograd_nonfused =
      ShouldIncludeWinogradNonfusedAlgo(input_shape, output_shape, dnums);
  se::dnn::ProfileResult best_result;
  int64 best_result_bytes_used = 0;

  for (const AlgorithmDesc& alg :
       GetAlgorithms(kind, use_winograd_nonfused, stream_exec_)) {
    ScratchAllocator scratch_allocator(device_ordinal, allocator);
    se::dnn::ProfileResult profile_result;
    VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
            << instr->ToString();

    bool launch_ok = RunCudnnConvolution(
                         kind, input_shape, filter_shape, output_shape,
                         input_buf.ValueOrDie(), filter_buf.ValueOrDie(),
                         output_buf.ValueOrDie(), &scratch_allocator, window,
                         dnums, AlgorithmConfig(alg), &stream, &profile_result)
                         .ok();

    if (launch_ok && profile_result.is_valid()) {
      int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
      VLOG(3) << "Run of algorithm " << AlgorithmToString(alg)
              << " succeeded, taking " << profile_result.elapsed_time_in_ms()
              << "ms and using " << NumBytesToString(scratch_bytes_used)
              << " of scratch (Best result: "
              << best_result.elapsed_time_in_ms() << "ms, "
              << NumBytesToString(best_result_bytes_used) << " of scratch)";
      if (profile_result.elapsed_time_in_ms() <
          best_result.elapsed_time_in_ms()) {
        best_result = profile_result;
        best_result_bytes_used = scratch_bytes_used;
      }
    } else {
      VLOG(3) << "Run of algorithm " << AlgorithmToString(alg) << " failed.";
    }
  }
  if (best_result.is_valid()) {
    VLOG(2) << "Best algorithm for " << instr->ToString() << ": "
            << AlgorithmToString(best_result.algorithm()) << ", takes "
            << best_result.elapsed_time_in_ms() << "ms, and uses "
            << best_result_bytes_used << "B of scratch memory.";
    return std::make_tuple(best_result.algorithm().algo_id(),
                           best_result.algorithm().tensor_ops_enabled(),
                           best_result_bytes_used);
  }

  LOG(WARNING) << "All algorithms tried for convolution " << instr->ToString()
               << " failed.  Falling back to default algorithm.";
  return nullopt;
}

StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnInstruction(
    HloInstruction* instr) {
  CHECK(IsCustomCallToDnnConvolution(*instr));

  const auto& call_target = instr->custom_call_target();
  const auto& lhs_shape = instr->operand(0)->shape();
  const auto& rhs_shape = instr->operand(1)->shape();
  const auto& conv_result_shape = instr->shape().tuple_shapes(0);
  optional<std::tuple<int64, bool, int64>> alg_scratch_and_tc;
  if (call_target == kCudnnConvForwardCallTarget) {
    alg_scratch_and_tc = PickBestAlgorithm(
        CudnnConvKind::kForward, /*input_shape=*/lhs_shape,
        /*filter_shape=*/rhs_shape, /*output_shape=*/conv_result_shape,
        instr->window(), instr->convolution_dimension_numbers(), instr);
  } else if (call_target == kCudnnConvBackwardInputCallTarget) {
    alg_scratch_and_tc = PickBestAlgorithm(
        CudnnConvKind::kBackwardInput, /*input_shape=*/conv_result_shape,
        /*filter_shape=*/rhs_shape, /*output_shape=*/lhs_shape, instr->window(),
        instr->convolution_dimension_numbers(), instr);
  } else if (call_target == kCudnnConvBackwardFilterCallTarget) {
    alg_scratch_and_tc = PickBestAlgorithm(
        CudnnConvKind::kBackwardFilter, /*input_shape=*/lhs_shape,
        /*filter_shape=*/conv_result_shape, /*output_shape=*/rhs_shape,
        instr->window(), instr->convolution_dimension_numbers(), instr);
  } else {
    LOG(FATAL) << "Unknown custom call target for cudnn conv: "
               << instr->ToString();
  }

  if (!alg_scratch_and_tc.has_value()) {
    return false;
  }

  int64 algorithm;
  bool tensor_ops_enabled;
  int64 scratch_bytes;

  std::tie(algorithm, tensor_ops_enabled, scratch_bytes) = *alg_scratch_and_tc;

  VLOG(1) << "Setting cudnn conv to use algorithm " << algorithm << " and "
          << NumBytesToString(scratch_bytes)
          << " of scratch memory: " << instr->ToString()
          << " tensor_ops_enabled: " << tensor_ops_enabled;

  // Replace instr with a new CustomCall which has the correct algorithm, and
  // whose output shape has the appropriate amount of scratch memory.
  HloComputation* computation = instr->parent();
  Shape new_call_shape =
      ShapeUtil::MakeTupleShape({instr->shape().tuple_shapes(0),
                                 ShapeUtil::MakeShape(U8, {scratch_bytes})});
  HloInstruction* algorithm_hlo = computation->AddInstruction(
      HloInstruction::CreateConstant(Literal::CreateR0<int64>(algorithm)));
  HloInstruction* tensor_ops_enabled_hlo =
      computation->AddInstruction(HloInstruction::CreateConstant(
          Literal::CreateR0<bool>(tensor_ops_enabled)));

  HloInstruction* new_call =
      computation->AddInstruction(HloInstruction::CreateCustomCall(
          new_call_shape,
          {instr->mutable_operand(0), instr->mutable_operand(1), algorithm_hlo,
           tensor_ops_enabled_hlo},
          instr->custom_call_target()));
  new_call->set_window(instr->window());
  new_call->set_convolution_dimension_numbers(
      instr->convolution_dimension_numbers());

  // Repackage new_call so it has the same shape as the original call, namely
  // (conv_result, u8[0]).
  HloInstruction* new_tuple =
      computation->AddInstruction(HloInstruction::CreateTuple(
          {computation->AddInstruction(HloInstruction::CreateGetTupleElement(
               new_call_shape.tuple_shapes(0), new_call, 0)),
           computation->AddInstruction(
               HloInstruction::CreateConstant(Literal::CreateR1<uint8>({})))}));

  TF_RETURN_IF_ERROR(instr->parent()->ReplaceInstruction(instr, new_tuple));
  return true;
}

StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnComputation(
    HloComputation* computation) {
  std::vector<HloInstruction*> convs;
  for (auto* instr : computation->instructions()) {
    if (IsCustomCallToDnnConvolution(*instr)) {
      convs.push_back(instr);
    }
  }

  bool changed = false;
  for (auto* instr : convs) {
    TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(instr));
    changed |= result;
  }
  return changed;
}

StatusOr<bool> CudnnConvolutionAlgorithmPicker::Run(HloModule* module) {
  bool changed = false;
  for (HloComputation* computation : module->MakeNonfusionComputations()) {
    TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
    changed |= result;
  }
  return changed;
}

}  // namespace gpu
}  // namespace xla