1/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7    http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
16#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
17
18#include <utility>
19
20#include "tensorflow/compiler/xla/map_util.h"
21#include "tensorflow/compiler/xla/ptr_util.h"
22#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
23#include "tensorflow/compiler/xla/status_macros.h"
24#include "tensorflow/compiler/xla/types.h"
25#include "tensorflow/compiler/xla/util.h"
26#include "tensorflow/core/lib/core/errors.h"
27#include "tensorflow/core/lib/strings/numbers.h"
28#include "tensorflow/core/platform/logging.h"
29#include "tensorflow/core/platform/types.h"
30
31namespace se = ::perftools::gputools;
32
33namespace xla {
34namespace gpu {
35
// Records a pre-existing device address for allocation `index` (used for
// buffers whose memory is provided externally, e.g. entry arguments or
// result buffers).  Build() will reuse this address instead of allocating.
// InsertOrDie CHECK-fails if `index` was already registered, so each buffer
// may be registered at most once.
void BufferAllocations::Builder::RegisterBuffer(BufferAllocation::Index index,
                                                se::DeviceMemoryBase address) {
  InsertOrDie(&registered_buffers_, index, address);
}
40
41StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
42    const BufferAssignment& buffer_assignment, int device_ordinal,
43    DeviceMemoryAllocator* memory_allocator) {
44  const int64 num_buffers = buffer_assignment.Allocations().size();
45  auto buffer_allocations = WrapUnique(
46      new BufferAllocations(num_buffers, device_ordinal, memory_allocator));
47
48  for (BufferAllocation::Index i = 0; i < num_buffers; ++i) {
49    // If buffer #i's address is already registered (e.g. external arguments or
50    // result buffers), use that registered buffer.
51    if (registered_buffers_.count(i)) {
52      se::DeviceMemoryBase address = FindOrDie(registered_buffers_, i);
53      if (reinterpret_cast<uintptr_t>(address.opaque()) %
54              kCudaMallocAlignBytes !=
55          0) {
56        return InternalError(
57            "Address of registered buffer %lld must be a multiple of %llx, but "
58            "was %p",
59            i, kCudaMallocAlignBytes, address.opaque());
60      }
61      buffer_allocations->SetBuffer(i, FindOrDie(registered_buffers_, i));
62      continue;
63    }
64
65    // Allocate each allocation that might escape, or is the temp buffer.
66    bool seen_temp_buffer = false;
67    const BufferAllocation& allocation = buffer_assignment.GetAllocation(i);
68    if (allocation.maybe_live_out() || allocation.IsPreallocatedTempBuffer()) {
69      const int64 buffer_size = allocation.size();
70      se::DeviceMemoryBase buffer_address;
71      if (buffer_size > 0) {
72        TF_ASSIGN_OR_RETURN(buffer_address, memory_allocator->Allocate(
73                                                device_ordinal, buffer_size));
74        if (buffer_address == nullptr) {
75          return ResourceExhausted(
76              "Out of memory when allocating %s for buffer %lld.",
77              tensorflow::strings::HumanReadableNumBytes(buffer_size).c_str(),
78              i);
79        }
80        if (reinterpret_cast<uintptr_t>(buffer_address.opaque()) %
81                kCudaMallocAlignBytes !=
82            0) {
83          return InternalError(
84              "Address returned by memory_allocator->Allocate must be a "
85              "multiple of %llx, but was %p",
86              kCudaMallocAlignBytes, buffer_address.opaque());
87        }
88      }
89      buffer_allocations->SetBuffer(i, buffer_address);
90      if (allocation.IsPreallocatedTempBuffer()) {
91        if (seen_temp_buffer) {
92          LOG(FATAL) << "Multiple temporary buffers detected.  BufferAssigner "
93                     << "must guarantee at most one temporary buffer.";
94        }
95        seen_temp_buffer = true;
96        buffer_allocations->temp_buffer_base_ = buffer_address;
97      }
98    }
99  }
100
101  if (VLOG_IS_ON(2)) {
102    for (BufferAllocation::Index i = 0; i < num_buffers; ++i) {
103      const auto& buf = buffer_allocations->buffers_[i];
104      VLOG(2) << "Buffer " << i << " -> " << buf.opaque() << " (" << buf.size()
105              << "B)";
106    }
107  }
108
109  return std::move(buffer_allocations);
110}
111
112tensorflow::Status BufferAllocations::TearDown(
113    const std::set<se::DeviceMemoryBase>& live_addresses,
114    const BufferAssignment& buffer_assignment) {
115  // Deallocate temporary buffers.
116  const int64 num_buffers = buffer_assignment.Allocations().size();
117  for (BufferAllocation::Index i = 0; i < num_buffers; ++i) {
118    const BufferAllocation& allocation = buffer_assignment.GetAllocation(i);
119    se::DeviceMemoryBase buffer_address = GetDeviceAddress(allocation.index());
120    // Deallocate buffers marked "maybe_live_out" but aren't actually live out,
121    // and temp buffers.
122    if ((allocation.maybe_live_out() &&
123         !live_addresses.count(buffer_address)) ||
124        allocation.IsPreallocatedTempBuffer()) {
125      TF_RETURN_IF_ERROR(
126          memory_allocator_->Deallocate(device_ordinal_, &buffer_address));
127    }
128  }
129  return tensorflow::Status::OK();
130}
131
// Returns the device address previously assigned to allocation
// `buffer_index` via SetBuffer.  CHECK-fails (crashes) on an out-of-range
// index rather than returning an error, since an invalid index indicates a
// compiler bug, not a runtime condition.
se::DeviceMemoryBase BufferAllocations::GetDeviceAddress(
    BufferAllocation::Index buffer_index) const {
  CHECK_GE(buffer_index, 0);
  CHECK_LT(buffer_index, buffers_.size());
  return buffers_[buffer_index];
}
138
139se::DeviceMemoryBase BufferAllocations::GetDeviceAddress(
140    const BufferAllocation::Slice& buffer_slice) const {
141  se::DeviceMemoryBase base = GetDeviceAddress(buffer_slice.index());
142  CHECK_LE(buffer_slice.offset(), base.size());
143  CHECK_LE(buffer_slice.offset() + buffer_slice.size(), base.size());
144  return se::DeviceMemoryBase(
145      static_cast<char*>(base.opaque()) + buffer_slice.offset(),
146      buffer_slice.size(), /*is_sub_buffer=*/true);
147}
148
// Associates device memory `buffer` with allocation `buffer_index`,
// overwriting any previous association.  CHECK-fails on an out-of-range
// index — the buffers_ vector is sized once at construction and never grows.
void BufferAllocations::SetBuffer(BufferAllocation::Index buffer_index,
                                  se::DeviceMemoryBase buffer) {
  CHECK_GE(buffer_index, 0);
  CHECK_LT(buffer_index, buffers_.size());
  buffers_[buffer_index] = buffer;
}
155
156}  // namespace gpu
157}  // namespace xla
158