1/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7    http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
#include "tensorflow/stream_executor/kernel_spec.h"

#include <cstring>
17
18
19namespace perftools {
20namespace gputools {
21
22KernelLoaderSpec::KernelLoaderSpec(port::StringPiece kernelname)
23    : kernelname_(kernelname.ToString()) {}
24
25OnDiskKernelLoaderSpec::OnDiskKernelLoaderSpec(port::StringPiece filename,
26                                               port::StringPiece kernelname)
27    : KernelLoaderSpec(kernelname), filename_(filename.ToString()) {}
28
29CudaPtxOnDisk::CudaPtxOnDisk(port::StringPiece filename,
30                             port::StringPiece kernelname)
31    : OnDiskKernelLoaderSpec(filename, kernelname) {}
32
33CudaCubinOnDisk::CudaCubinOnDisk(port::StringPiece filename,
34                                 port::StringPiece kernelname)
35    : OnDiskKernelLoaderSpec(filename, kernelname) {}
36
37CudaCubinInMemory::CudaCubinInMemory(const char *bytes,
38                                     port::StringPiece kernelname)
39    : KernelLoaderSpec(kernelname), bytes_(bytes) {}
40
// Strict-weak "less than" over (major, minor) compute capabilities: the major
// versions are compared first and the minor versions break ties. This is
// exactly the lexicographic ordering std::tuple's operator< provides.
bool CompareComputeCapability(const std::tuple<int, int> &lhs,
                              const std::tuple<int, int> &rhs) {
  return lhs < rhs;
}
47
48const std::tuple<int, int> CudaPtxInMemory::kMinimumCapability{1, 0};
49
50CudaPtxInMemory::CudaPtxInMemory(port::StringPiece ptx,
51                                 port::StringPiece kernel_name,
52                                 bool ptx_compressed)
53    : KernelLoaderSpec(kernel_name),
54      ptx_by_compute_capability_(CompareComputeCapability) {
55  if (ptx_compressed) {
56    // Lazy decompression. Put an empty string in decompressed_ptx_ showing that
57    // the original ptx is compressed.
58    decompressed_ptx_[ptx.data()] = "";
59  }
60  ptx_by_compute_capability_[kMinimumCapability] = ptx.data();
61}
62
63CudaPtxInMemory::CudaPtxInMemory(
64    const std::initializer_list<CudaPtxInMemory::PtxSpec> &spec_list,
65    port::StringPiece kernel_name, bool ptx_compressed)
66    : KernelLoaderSpec(kernel_name),
67      ptx_by_compute_capability_(CompareComputeCapability) {
68  for (const auto &spec : spec_list) {
69    int major, minor;
70    port::StringPiece ptx;
71    std::tie(major, minor, ptx) = spec;
72    if (ptx_compressed) {
73      // Lazy decompression. Put an empty string in decompressed_ptx_ showing
74      // that the original ptx is compressed.
75      decompressed_ptx_[ptx.data()] = "";
76    }
77    ptx_by_compute_capability_[std::tuple<int, int>{major, minor}] = ptx.data();
78  }
79}
80
81string CudaPtxInMemory::DecompressPtx(const char *ptx) {
82  // Get the length of the PTX string from the beginning of the buffer.
83  uint64 ptx_length = *reinterpret_cast<const uint64 *>(ptx);
84  // Get the PTX string from the buffer with offset and length.
85  string compressed_ptx(ptx + sizeof(uint64),
86                        ptx + sizeof(uint64) + ptx_length);
87  string decompressed_ptx;
88  // Decompress the PTX string with bzip2.
89  LOG(FATAL) << "bzip2 decompression is not supported yet.";
90  return decompressed_ptx;
91}
92
93const char *CudaPtxInMemory::default_text() const {
94  if (ptx_by_compute_capability_.empty()) {
95    return nullptr;
96  }
97
98  mutex_lock lock{mu_};
99
100  auto ptx = ptx_by_compute_capability_.begin()->second;
101  // Check if there is an entry in decompressed ptx table.
102  auto decompressed_ptx_iter = decompressed_ptx_.find(ptx);
103  if (decompressed_ptx_iter != decompressed_ptx_.end()) {
104    // If the decompressed string is empty, which means the ptx hasn't been
105    // decompressed, decompress it here.
106    if (decompressed_ptx_iter->second.empty()) {
107      decompressed_ptx_iter->second = DecompressPtx(ptx);
108    }
109    return decompressed_ptx_iter->second.c_str();
110  }
111  return ptx;
112}
113
114const char *CudaPtxInMemory::original_default_text() const {
115  if (ptx_by_compute_capability_.empty()) {
116    return nullptr;
117  }
118
119  return ptx_by_compute_capability_.begin()->second;
120}
121
122const char *CudaPtxInMemory::text(int compute_capability_major,
123                                  int compute_capability_minor) const {
124  std::tuple<int, int> capability{compute_capability_major,
125                                  compute_capability_minor};
126
127  auto ptx_iter = ptx_by_compute_capability_.find(capability);
128  if (ptx_iter == ptx_by_compute_capability_.end()) {
129    return nullptr;
130  }
131
132  mutex_lock lock{mu_};
133
134  // Check if there is an entry in decompressed ptx table.
135  auto decompressed_ptx_iter = decompressed_ptx_.find(ptx_iter->second);
136  if (decompressed_ptx_iter != decompressed_ptx_.end()) {
137    // If the decompressed string is empty, which means the ptx hasn't been
138    // decompressed, decompress it here.
139    if (decompressed_ptx_iter->second.empty()) {
140      decompressed_ptx_iter->second = DecompressPtx(ptx_iter->second);
141    }
142    return decompressed_ptx_iter->second.c_str();
143  }
144  return ptx_iter->second;
145}
146
147const char *CudaPtxInMemory::original_text(int compute_capability_major,
148                                           int compute_capability_minor) const {
149  std::tuple<int, int> capability{compute_capability_major,
150                                  compute_capability_minor};
151
152  auto ptx_iter = ptx_by_compute_capability_.find(capability);
153  if (ptx_iter == ptx_by_compute_capability_.end()) {
154    return nullptr;
155  }
156
157  return ptx_iter->second;
158}
159
160OpenCLTextOnDisk::OpenCLTextOnDisk(port::StringPiece filename,
161                                   port::StringPiece kernelname)
162    : OnDiskKernelLoaderSpec(filename, kernelname) {}
163
164OpenCLTextInMemory::OpenCLTextInMemory(port::StringPiece text,
165                                       port::StringPiece kernelname)
166    : KernelLoaderSpec(kernelname), text_(text.ToString()) {}
167
168OpenCLBinaryOnDisk::OpenCLBinaryOnDisk(port::StringPiece filename,
169                                       port::StringPiece kernelname)
170    : OnDiskKernelLoaderSpec(filename, kernelname) {}
171
172MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLTextOnDisk(
173    port::StringPiece filename, port::StringPiece kernelname) {
174  CHECK(ocl_text_on_disk_ == nullptr);
175  ocl_text_on_disk_.reset(new OpenCLTextOnDisk{filename, kernelname});
176  return this;
177}
178
179MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLBinaryOnDisk(
180    port::StringPiece filename, port::StringPiece kernelname) {
181  CHECK(ocl_binary_on_disk_ == nullptr);
182  ocl_binary_on_disk_.reset(new OpenCLBinaryOnDisk{filename, kernelname});
183  return this;
184}
185
186MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLTextInMemory(
187    port::StringPiece filename, port::StringPiece kernelname) {
188  CHECK(ocl_text_in_memory_ == nullptr);
189  ocl_text_in_memory_.reset(new OpenCLTextInMemory{filename, kernelname});
190  return this;
191}
192
193MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxOnDisk(
194    port::StringPiece filename, port::StringPiece kernelname) {
195  CHECK(cuda_ptx_on_disk_ == nullptr);
196  cuda_ptx_on_disk_.reset(new CudaPtxOnDisk{filename, kernelname});
197  return this;
198}
199
200MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCubinInMemory(
201    const char *bytes, port::StringPiece kernelname) {
202  CHECK(cuda_cubin_in_memory_ == nullptr);
203  cuda_cubin_in_memory_.reset(new CudaCubinInMemory{bytes, kernelname});
204  return this;
205}
206
207MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCubinOnDisk(
208    port::StringPiece filename, port::StringPiece kernelname) {
209  CHECK(cuda_cubin_on_disk_ == nullptr);
210  cuda_cubin_on_disk_.reset(new CudaCubinOnDisk{filename, kernelname});
211  return this;
212}
213
214MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory(
215    port::StringPiece ptx, port::StringPiece kernelname) {
216  CHECK(cuda_ptx_in_memory_ == nullptr);
217  cuda_ptx_in_memory_.reset(
218      new CudaPtxInMemory{ptx, kernelname, false /* ptx_compressed */});
219  return this;
220}
221
222MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory(
223    port::StringPiece ptx, port::StringPiece kernelname) {
224  CHECK(cuda_ptx_in_memory_ == nullptr);
225  cuda_ptx_in_memory_.reset(
226      new CudaPtxInMemory{ptx, kernelname, true /* ptx_compressed */});
227  return this;
228}
229
230MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory(
231    std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
232    port::StringPiece kernelname) {
233  CHECK(cuda_ptx_in_memory_ == nullptr);
234  cuda_ptx_in_memory_.reset(
235      new CudaPtxInMemory{spec_list, kernelname, false /* ptx_compressed */});
236  return this;
237}
238
239MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory(
240    std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
241    port::StringPiece kernelname) {
242  CHECK(cuda_ptx_in_memory_ == nullptr);
243  cuda_ptx_in_memory_.reset(
244      new CudaPtxInMemory{spec_list, kernelname, true /* ptx_compressed */});
245  return this;
246}
247
248MultiKernelLoaderSpec::MultiKernelLoaderSpec(size_t arity) : arity_(arity) {}
249
250}  // namespace gputools
251}  // namespace perftools
252