/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_BACKEND_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_BACKEND_H_

#include <map>
#include <memory>
#include <string>
#include <vector>

#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/computation_placer.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/pool.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/thread_annotations.h"

namespace Eigen {
struct ThreadPoolDevice;
}

namespace xla {

// Options to configure the backend when it is created.
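//
// A minimal usage sketch (a hypothetical illustration, assuming a default
// platform is registered; see Backend::CreateBackend below):
//
//   BackendOptions options;
//   options.set_intra_op_parallelism_threads(4);
//   auto backend = Backend::CreateBackend(options).ConsumeValueOrDie();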
class BackendOptions {
 public:
  // Set the platform backing the backend, or nullptr for the default platform.
  BackendOptions& set_platform(perftools::gputools::Platform* platform);
  perftools::gputools::Platform* platform() const;

  // Sets the thread pool size for parallel execution of an individual operator.
  // The default value of -1 will result in initializing the thread pool with
  // the number of threads equal to the number of cores in the system.
  BackendOptions& set_intra_op_parallelism_threads(int num_threads);
  int intra_op_parallelism_threads() const;

 private:
  perftools::gputools::Platform* platform_ = nullptr;
  int intra_op_parallelism_threads_ = -1;
};

// Class which encapsulates an XLA backend. It includes everything necessary
// to compile and execute computations on a particular platform.
//
// It also offers a pooling API for creation/use of initialized streams:
//
//    StreamPtr stream =
//        backend->BorrowStream(device_ordinal).ConsumeValueOrDie();
class Backend {
 public:
  using StreamPtr = Pool<perftools::gputools::Stream>::SmartPtr;

  // Creates a new backend.
  static StatusOr<std::unique_ptr<Backend>> CreateBackend(
      const BackendOptions& options);

  // Creates a backend for the default platform. The default platform is
  // defined in PlatformUtil.
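  //
  // A minimal sketch of typical use (a hypothetical illustration; error
  // handling elided):
  //
  //   auto backend = Backend::CreateDefaultBackend().ConsumeValueOrDie();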
  static StatusOr<std::unique_ptr<Backend>> CreateDefaultBackend();

  ~Backend();

  // Accessors for the various objects.
  perftools::gputools::Platform* platform() const { return platform_; }
  Compiler* compiler() const { return compiler_; }
  DeviceMemoryAllocator* memory_allocator() const {
    return memory_allocator_.get();
  }
  TransferManager* transfer_manager() const { return transfer_manager_; }
  ComputationPlacer* computation_placer() const { return computation_placer_; }

  // Returns the number of devices of the platform type which are visible. Not
  // all of these devices may be usable by XLA.
  int device_count() const { return stream_executors_.size(); }

  // Returns the device ordinal number of the default device.
  int default_device_ordinal() const;

  // Returns stream executors of all supported devices for this backend. The
  // executors are ordered by the device ordinal.
  const std::vector<perftools::gputools::StreamExecutor*>& stream_executors()
      const {
    return stream_executors_;
  }

  // Returns the stream executor for the given device ordinal.
  StatusOr<perftools::gputools::StreamExecutor*> stream_executor(
      int device_ordinal) const;

  // Returns the stream executor for the default device ordinal. This stream
  // executor can only be used when the number of computations is 1
  // (replication can be > 1).
  perftools::gputools::StreamExecutor* default_stream_executor() const {
    CHECK(!stream_executors_.empty());
    return stream_executors_[0];
  }

  // Borrows a stream for use by the caller, either by grabbing it from an
  // internal pool or by constructing/initializing it, and returns the result
  // to the caller.
  StatusOr<StreamPtr> BorrowStream(int device_ordinal);
  StatusOr<StreamPtr> BorrowStream(
      perftools::gputools::StreamExecutor* executor);

  // Returns a function that borrows a stream, as `BorrowStream` above does.
  // This is purely a convenience; the caller could equally well construct
  // such a lambda itself.
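  //
  // For example (a hypothetical sketch):
  //
  //   auto borrow = backend->StreamBorrower();
  //   StreamPtr stream = borrow(/*device_ordinal=*/0).ConsumeValueOrDie();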
  std::function<StatusOr<StreamPtr>(int)> StreamBorrower() {
    return [this](int device_ordinal) { return BorrowStream(device_ordinal); };
  }

  // Returns whether the given device ordinal of the backend is supported.
  bool device_ordinal_supported(int device_ordinal) const {
    return (device_ordinal >= 0 && device_ordinal < device_count() &&
            stream_executors_[device_ordinal] != nullptr);
  }

  // Returns a string identifier for the given device, e.g. "GPU:3".
  string device_name(int device_ordinal) const {
    return tensorflow::strings::StrCat(platform_->Name(), ":", device_ordinal);
  }

  // Returns true if the devices with the given ordinals are equivalent from
  // XLA's perspective. That is, an executable compiled for one device would
  // be equivalent to an executable compiled for the other.
  StatusOr<bool> devices_equivalent(int device_ordinal_a, int device_ordinal_b);

  // For the host platform, returns the threadpool to use when scheduling
  // parallel operators. For other platforms, returns NULL.
  tensorflow::thread::ThreadPool* inter_op_thread_pool() const;

  // For the host platform, returns the configured eigen threadpool device to be
  // used for scheduling work. For other platforms, returns NULL.
  const Eigen::ThreadPoolDevice* eigen_intra_op_thread_pool_device() const;
  tensorflow::thread::ThreadPool* eigen_intra_op_thread_pool() const;

  // Resets the devices associated with this backend.
  Status ResetDevices();

 private:
  struct EigenThreadPoolWrapper;
  Backend(perftools::gputools::Platform* platform, Compiler* compiler,
          tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
              stream_executors,
          TransferManager* transfer_manager,
          ComputationPlacer* computation_placer,
          int intra_op_parallelism_threads);
  Backend(const Backend&) = delete;
  Backend& operator=(const Backend&) = delete;

  perftools::gputools::Platform* platform_;
  Compiler* compiler_;
  TransferManager* transfer_manager_;
  ComputationPlacer* computation_placer_;

  // Vector of stream executors. stream_executors_[0] is the default executor.
  std::vector<perftools::gputools::StreamExecutor*> stream_executors_;

  tensorflow::mutex mu_;

  // Mapping from stream executor to stream pools, used by `BorrowStream` above.
  std::map<perftools::gputools::StreamExecutor*,
           Pool<perftools::gputools::Stream>>
      stream_pools_ GUARDED_BY(mu_);

  // The default memory allocator to use.
  std::unique_ptr<StreamExecutorMemoryAllocator> memory_allocator_;

  // For the CPU backend, a threadpool for scheduling parallel operators.
  std::unique_ptr<tensorflow::thread::ThreadPool> inter_op_thread_pool_;

  // For the CPU backend, an Eigen threadpool device for use by Eigen code.
  std::unique_ptr<EigenThreadPoolWrapper> intra_op_thread_pool_wrapper_;
};

}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_BACKEND_H_