11e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
21e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
31e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsLicensed under the Apache License, Version 2.0 (the "License");
41e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsyou may not use this file except in compliance with the License.
51e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsYou may obtain a copy of the License at
61e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
71e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    http://www.apache.org/licenses/LICENSE-2.0
81e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
91e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsUnless required by applicable law or agreed to in writing, software
101e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsdistributed under the License is distributed on an "AS IS" BASIS,
111e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
121e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsSee the License for the specific language governing permissions and
131e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinslimitations under the License.
141e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins==============================================================================*/
151e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
161e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
171e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
181e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include <map>
191e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include <memory>
201e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include <string>
211e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include <utility>
221e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
231e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/ptr_util.h"
241e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h"
251e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h"
261e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
27abbb19bb9445ffee96ff2946083a3b5c8dadc0d0Eli Bendersky#include "tensorflow/compiler/xla/status_macros.h"
281e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/util.h"
291e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3034cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/ADT/STLExtras.h"
3134cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/ADT/StringMap.h"
3234cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/ADT/StringSet.h"
3334cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Analysis/TargetLibraryInfo.h"
3434cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Analysis/TargetTransformInfo.h"
3534cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Bitcode/BitcodeReader.h"
3634cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Bitcode/BitcodeWriter.h"
374e9fa6dcce4912a4797c48f4cb55d3564961bfcaA. Unique TensorFlower#include "llvm/CodeGen/CommandFlags.def"
3834cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/IR/LLVMContext.h"
3934cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/IR/LegacyPassManager.h"
4034cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/IR/Module.h"
4169db15cc49f84baaff27a19d792102876e811c97A. Unique TensorFlower#include "llvm/IR/Verifier.h"
4234cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Linker/Linker.h"
4334cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/PassRegistry.h"
4434cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Support/CommandLine.h"
4534cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Support/FileSystem.h"
4634cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Support/FormattedStream.h"
4734cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Support/TargetRegistry.h"
4834cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Support/TargetSelect.h"
4934cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Support/ToolOutputFile.h"
5034cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Target/TargetMachine.h"
5134cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Transforms/IPO.h"
5234cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Transforms/IPO/AlwaysInliner.h"
5334cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Transforms/IPO/Internalize.h"
5469db15cc49f84baaff27a19d792102876e811c97A. Unique TensorFlower#include "llvm/Transforms/IPO/PassManagerBuilder.h"
5569db15cc49f84baaff27a19d792102876e811c97A. Unique TensorFlower#include "llvm/Transforms/Scalar.h"
561e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/types.h"
571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/lib/core/stringpiece.h"
581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/lib/io/path.h"
591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/lib/strings/str_util.h"
601e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/lib/strings/stringprintf.h"
611e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/platform/env.h"
621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/platform/logging.h"
630bd46f52dad251846996bf440177128a16d429c2Artem Belevich#include "tensorflow/core/platform/tracing.h"
641e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsnamespace xla {
661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsnamespace gpu {
671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsnamespace {
681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
// Inline threshold passed to LLVM's function-inlining pass when the optimizer
// is allowed to inline more than just "alwaysinline" functions.
constexpr int kDefaultInlineThreshold = 1100;
711e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
727b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar// Gets the libdevice filename for a particular compute capability.  When
737b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar// presented with a GPU we don't recognize, we just return the libdevice from
747b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar// compute_20.
75a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyenstatic string GetLibdeviceFilename(const string& libdevice_dir_path,
76a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen                                   std::pair<int, int> compute_capability) {
77a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen  // Since CUDA 9.0, all GPU versions are included in a single file
78a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen  const char* unified_libdevice_filename = "libdevice.10.bc";
79a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen  std::vector<string> unified_libdevice_files;
80191825e63f341a4e7777b85254f616e541000d5cA. Unique TensorFlower  const tensorflow::Status status =
81a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen    tensorflow::Env::Default()->GetMatchingPaths(
82a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen      tensorflow::io::JoinPath(libdevice_dir_path, unified_libdevice_filename),
83a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen      &unified_libdevice_files);
84a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen  if (status.ok() && unified_libdevice_files.size() == 1) {
85a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen    return unified_libdevice_filename;
86a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen  }
877b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  // There are only four libdevice files: compute_{20,30,35,50}.  Each GPU
887b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  // version gets mapped to one of these.  Note in particular that sm_60 and
897b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  // sm_61 map to libdevice.compute_30.
907b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  static auto* m = new std::map<std::pair<int, int>, int>({{{2, 0}, 20},
917b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{2, 1}, 20},
927b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{3, 0}, 30},
937b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{3, 2}, 30},
947b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{3, 5}, 35},
957b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{3, 7}, 35},
967b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{5, 0}, 50},
977b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{5, 2}, 50},
987b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{5, 3}, 50},
997b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{6, 0}, 30},
1007b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{6, 1}, 30},
1017b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{6, 2}, 30}});
1027b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  int libdevice_version = 20;
1037b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  auto it = m->find(compute_capability);
1047b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  if (it != m->end()) {
1057b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar    libdevice_version = it->second;
1067b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  } else {
1077b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar    LOG(WARNING) << "Unknown compute capability (" << compute_capability.first
1087b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                 << ", " << compute_capability.second << ") ."
1097b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                 << "Defaulting to libdevice for compute_" << libdevice_version;
1107b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  }
1117b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  return tensorflow::strings::StrCat("libdevice.compute_", libdevice_version,
1127b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                     ".10.bc");
1137b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar}
1147b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar
1157b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar// Gets the GPU name as it's known to LLVM for a given compute capability.  If
116a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen// we see an unrecognized compute capability, we return "sm_30".
1177b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebarstatic string GetSmName(std::pair<int, int> compute_capability) {
1187b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  static auto* m = new std::map<std::pair<int, int>, int>({{{2, 0}, 20},
1197b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{2, 1}, 21},
1207b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{3, 0}, 30},
1217b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{3, 2}, 32},
1227b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{3, 5}, 35},
1237b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{3, 7}, 37},
1247b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{5, 0}, 50},
1257b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{5, 2}, 52},
1267b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{5, 3}, 53},
1277b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{6, 0}, 60},
1287b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                                           {{6, 1}, 61},
129a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen                                                           {{6, 2}, 62},
130a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen                    // TODO: Change this to 70 once LLVM NVPTX supports it
131a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen                                                           {{7, 0}, 60}});
132a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen  int sm_version = 30;
1337b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  auto it = m->find(compute_capability);
1347b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  if (it != m->end()) {
1357b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar    sm_version = it->second;
1367b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  } else {
1377b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar    LOG(WARNING) << "Unknown compute capability (" << compute_capability.first
1387b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                 << ", " << compute_capability.second << ") ."
1397b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                 << "Defaulting to telling LLVM that we're compiling for sm_"
1407b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                 << sm_version;
1411e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
1427b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  return tensorflow::strings::StrCat("sm_", sm_version);
1431e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
1441e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
1451e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Convenience function for producing a name of a temporary compilation product
1461e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// from the input filename.
1471e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsstring MakeNameForTempProduct(const std::string& input_filename,
1481e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                              tensorflow::StringPiece extension) {
14912efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky  return ReplaceFilenameExtension(
15012efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky      tensorflow::io::Basename(llvm_ir::AsString(input_filename)), extension);
1511e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
1521e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
1531e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Initializes LLVM passes. Uses the PassRegistry mechanism.
1541e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsvoid InitializePasses(llvm::PassRegistry* pass_registry) {
1551e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeCore(*pass_registry);
1561e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeCodeGen(*pass_registry);
1571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeScalarOpts(*pass_registry);
1581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeObjCARCOpts(*pass_registry);
1591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeVectorization(*pass_registry);
1601e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeIPO(*pass_registry);
1611e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeAnalysis(*pass_registry);
1621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeTransformUtils(*pass_registry);
1631e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeInstCombine(*pass_registry);
1641e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeInstrumentation(*pass_registry);
1651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeTarget(*pass_registry);
1661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeCodeGenPreparePass(*pass_registry);
1671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
1681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
1691e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Returns the TargetMachine, given a triple.
1701e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsstd::unique_ptr<llvm::TargetMachine> GetTargetMachine(
171d45505fe0c7ab9a10f16682f54d0eb54c4776cd1Justin Lebar    llvm::Triple triple, tensorflow::StringPiece cpu_name,
172d45505fe0c7ab9a10f16682f54d0eb54c4776cd1Justin Lebar    const HloModuleConfig& hlo_module_config) {
1731e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  std::string error;
1741e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  const llvm::Target* target = TargetRegistry::lookupTarget("", triple, error);
1751e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (target == nullptr) {
1761e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    LOG(FATAL) << "Unable to find Target for triple '" << triple.str() << "'"
1771e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins               << " -- " << error;
1781e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    return nullptr;
1791e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
1801e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
1811e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  TargetOptions target_options = InitTargetOptionsFromCodeGenFlags();
182abbb19bb9445ffee96ff2946083a3b5c8dadc0d0Eli Bendersky  llvm_ir::SetTargetOptions(
18332e37a1ff2587e02fd35e316c0ac00dcc4e72d17Eli Bendersky      /*fast_math_enabled=*/hlo_module_config.debug_options()
18432e37a1ff2587e02fd35e316c0ac00dcc4e72d17Eli Bendersky          .xla_enable_fast_math(),
185abbb19bb9445ffee96ff2946083a3b5c8dadc0d0Eli Bendersky      &target_options);
186d45505fe0c7ab9a10f16682f54d0eb54c4776cd1Justin Lebar
18712efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky  // Enable FMA synthesis.
18812efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky  target_options.AllowFPOpFusion = FPOpFusion::Fast;
1891e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
1901e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Set the verbose assembly options.
19112efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky  target_options.MCOptions.AsmVerbose = false;
1921e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
1931e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // The selection of codegen optimization level is copied from function
19434cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa  // GetCodeGenOptLevel in //third_party/llvm/llvm/tools/opt/opt.cpp.
1951e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  CodeGenOpt::Level codegen_opt_level;
19612efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky  switch (hlo_module_config.debug_options().xla_backend_optimization_level()) {
1971e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    case 1:
1981e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      codegen_opt_level = CodeGenOpt::Less;
1991e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      break;
2001e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    case 2:
2011e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      codegen_opt_level = CodeGenOpt::Default;
2021e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      break;
2031e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    case 3:
2041e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      codegen_opt_level = CodeGenOpt::Aggressive;
2051e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      break;
2061e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    default:
2071e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      codegen_opt_level = CodeGenOpt::None;
2081e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
2091e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  return WrapUnique(target->createTargetMachine(
2101e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      triple.str(), llvm_ir::AsStringRef(cpu_name), "+ptx42", target_options,
2113f7c8210fea1a6559b1484b1933f59c1e844fdb1Benjamin Kramer      Optional<Reloc::Model>(RelocModel), Optional<CodeModel::Model>(CMModel),
2123f7c8210fea1a6559b1484b1933f59c1e844fdb1Benjamin Kramer      codegen_opt_level));
2131e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
2141e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2151e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Adds the standard LLVM optimization passes, based on the speed optimization
2161e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// level (opt_level) and size optimization level (size_level). Both module
2171e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// and function-level passes are added, so two pass managers are passed in and
2181e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// modified by this function.
2191e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsvoid AddOptimizationPasses(unsigned opt_level, unsigned size_level,
2201e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                           llvm::TargetMachine* target_machine,
2211e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                           llvm::legacy::PassManagerBase* module_passes,
2221e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                           llvm::legacy::FunctionPassManager* function_passes) {
2231e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  PassManagerBuilder builder;
2241e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  builder.OptLevel = opt_level;
2251e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  builder.SizeLevel = size_level;
2261e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2271e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (opt_level > 1) {
2281e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    builder.Inliner = llvm::createFunctionInliningPass(kDefaultInlineThreshold);
2291e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  } else {
2301e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    // Only inline functions marked with "alwaysinline".
2311e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    builder.Inliner = llvm::createAlwaysInlinerLegacyPass();
2321e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
2331e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2341e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  builder.DisableUnitAtATime = false;
2351e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  builder.DisableUnrollLoops = opt_level == 0;
2361e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  builder.LoopVectorize = opt_level > 0;
2371e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  builder.SLPVectorize = opt_level > 1 && size_level < 2;
2381e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2391e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // NVPTX's early-as-possible passes include NVVM reflect.
24095c7dfc16063c77871257b4da8d8958c731d19c2A. Unique TensorFlower  target_machine->adjustPassManager(builder);
2411e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2421e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  builder.populateFunctionPassManager(*function_passes);
2431e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  builder.populateModulePassManager(*module_passes);
2441e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
2451e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2461e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Emits the given module to a bit code file.
2471e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsvoid EmitBitcodeToFile(const Module& module, tensorflow::StringPiece filename) {
2481e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  std::error_code error_code;
249321a2ebdf39b3e779a9c677aa6c1249a7aa70f49Benjamin Kramer  llvm::ToolOutputFile outfile(filename.ToString().c_str(), error_code,
250321a2ebdf39b3e779a9c677aa6c1249a7aa70f49Benjamin Kramer                               llvm::sys::fs::F_None);
2511e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (error_code) {
2521e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    LOG(FATAL) << "opening bitcode file for writing: " << error_code.message();
2531e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
2541e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
255f77256a164ccb173a85472286311644db11ae5b1Benjamin Kramer  llvm::WriteBitcodeToFile(module, outfile.os());
2561e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  outfile.keep();
2571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
2581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Emits the given module to PTX. target_machine is an initialized TargetMachine
2601e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// for the NVPTX target.
2611e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsstring EmitModuleToPTX(Module* module, llvm::TargetMachine* target_machine) {
2621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  std::string ptx;  // need a std::string instead of a ::string.
2631e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  {
2641e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    llvm::raw_string_ostream stream(ptx);
2651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    llvm::buffer_ostream pstream(stream);
2661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    // The extension is stripped by IrDumpingPassManager, so we need to
2671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    // get creative to add a suffix.
2681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    string module_id(llvm_ir::AsString(module->getModuleIdentifier()));
2691e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    IrDumpingPassManager codegen_passes(
2701e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins        ReplaceFilenameExtension(tensorflow::io::Basename(module_id),
2711e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                                 "-nvptx.dummy"),
27212efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky        "", false);
2731e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass(
2741e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins        llvm::Triple(module->getTargetTriple())));
2751e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2761e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    target_machine->addPassesToEmitFile(codegen_passes, pstream,
2771e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                                        llvm::TargetMachine::CGFT_AssemblyFile);
2781e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    codegen_passes.run(*module);
2791e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
2801e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2811e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  return ptx;
2821e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
2831e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2841e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// LLVM has an extensive flags mechanism of its own, which is only accessible
2851e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// through the command line. Internal libraries within LLVM register parsers for
2861e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// flags, with no other way to configure them except pass these flags.
2871e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// To do this programmatically, we invoke ParseCommandLineOptions manually with
2881e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// a "fake argv".
2891e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Note: setting flags with this method is stateful, since flags are just
2901e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// static globals within LLVM libraries.
2911e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsvoid FeedLLVMWithFlags(const std::vector<string>& cl_opts) {
2921e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  std::vector<const char*> fake_argv = {""};
2931e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  for (const string& cl_opt : cl_opts) {
2941e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    fake_argv.push_back(cl_opt.c_str());
2951e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
2961e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]);
2971e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
2981e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2991e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Returns whether the module could use any libdevice functions. This function
3001e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// may have false positives -- the module might not use libdevice even if this
3011e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// function returns true.
3021e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsbool CouldNeedLibdevice(const llvm::Module& module) {
3031e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  for (const llvm::Function& function : module.functions()) {
3041e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    // This is a conservative approximation -- not all such functions are in
3051e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    // libdevice.
3061e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    if (!function.isIntrinsic() && function.isDeclaration()) {
3071e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      return true;
3081e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    }
3091e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
3101e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  return false;
3111e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
3121e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3131e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Links libdevice into the given module if the module needs libdevice.
3147b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebartensorflow::Status LinkLibdeviceIfNecessary(
3157b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar    llvm::Module* module, std::pair<int, int> compute_capability,
3167b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar    const string& libdevice_dir_path) {
3171e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (!CouldNeedLibdevice(*module)) {
3181e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    return tensorflow::Status::OK();
3191e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
3201e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3211e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::Linker linker(*module);
3227b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  string libdevice_path = tensorflow::io::JoinPath(
323a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen      libdevice_dir_path, GetLibdeviceFilename(libdevice_dir_path,
324a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen                                               compute_capability));
3257b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(libdevice_path));
3267b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  VLOG(1) << "Linking with libdevice from: " << libdevice_path;
3271e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  std::unique_ptr<llvm::Module> libdevice_module =
3287b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar      LoadIRModule(libdevice_path, &module->getContext());
32900f8415dcada6e416fe67abb99675abbde16845dA. Unique TensorFlower  if (linker.linkInModule(
33000f8415dcada6e416fe67abb99675abbde16845dA. Unique TensorFlower          std::move(libdevice_module), llvm::Linker::Flags::LinkOnlyNeeded,
33100f8415dcada6e416fe67abb99675abbde16845dA. Unique TensorFlower          [](Module& M, const StringSet<>& GVS) {
33200f8415dcada6e416fe67abb99675abbde16845dA. Unique TensorFlower            internalizeModule(M, [&M, &GVS](const GlobalValue& GV) {
33300f8415dcada6e416fe67abb99675abbde16845dA. Unique TensorFlower              return !GV.hasName() || (GVS.count(GV.getName()) == 0);
33400f8415dcada6e416fe67abb99675abbde16845dA. Unique TensorFlower            });
33500f8415dcada6e416fe67abb99675abbde16845dA. Unique TensorFlower          })) {
3367b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar    return tensorflow::errors::Internal(tensorflow::strings::StrCat(
3377b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar        "Error linking libdevice from ", libdevice_path));
3381e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
3391e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  return tensorflow::Status::OK();
3401e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
3411e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3421e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsStatusOr<string> CompileModuleToPtx(llvm::Module* module,
3437b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                    std::pair<int, int> compute_capability,
344d45505fe0c7ab9a10f16682f54d0eb54c4776cd1Justin Lebar                                    const HloModuleConfig& hlo_module_config,
3451e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                                    const string& libdevice_dir_path) {
34666fc99a3b53c2e77d1c8569e1597a0094b0f99a8Artem Belevich  // If the module has no functions or globals, there's nothing to compile. Just
34766fc99a3b53c2e77d1c8569e1597a0094b0f99a8Artem Belevich  // return an empty string.
34866fc99a3b53c2e77d1c8569e1597a0094b0f99a8Artem Belevich  if (module->empty() && module->global_empty()) {
34966fc99a3b53c2e77d1c8569e1597a0094b0f99a8Artem Belevich    VLOG(2) << "Module '" << llvm_ir::AsString(module->getName())
35066fc99a3b53c2e77d1c8569e1597a0094b0f99a8Artem Belevich            << "' is empty. Skipping compilation.";
35166fc99a3b53c2e77d1c8569e1597a0094b0f99a8Artem Belevich    return string();
35266fc99a3b53c2e77d1c8569e1597a0094b0f99a8Artem Belevich  }
3531e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Link the input module with libdevice, to pull in implementations of some
3541e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // builtins.
3557b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  TF_RETURN_IF_ERROR(
3567b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar      LinkLibdeviceIfNecessary(module, compute_capability, libdevice_dir_path));
3571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Set the flush-denormals-to-zero flag on the module so the NVVM reflect pass
3591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // can access it.
36012efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky  module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
36112efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky                        hlo_module_config.debug_options().xla_gpu_ftz());
3621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3631e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // If ftz is enabled, set it as an attribute on every function in the module.
36412efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky  if (hlo_module_config.debug_options().xla_gpu_ftz()) {
3651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    for (llvm::Function& fn : *module) {
3661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      fn.addFnAttr("nvptx-f32ftz", "true");
3671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    }
3681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
3691e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
37012efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky  IrDumpingPassManager module_passes(module->getModuleIdentifier(), "", false);
3711e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3721e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Add an appropriate TargetLibraryInfo pass for the module's triple.
3731e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::TargetLibraryInfoWrapperPass* tliwp =
3741e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      new llvm::TargetLibraryInfoWrapperPass(
3751e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins          llvm::Triple(module->getTargetTriple()));
3761e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  module_passes.add(tliwp);
3771e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3781e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Try to fetch the target triple from the module. If not present, set a
3791e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // default target triple.
3801e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::Triple target_triple = llvm::Triple(module->getTargetTriple());
3811e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (target_triple.getArch() == llvm::Triple::UnknownArch) {
3821e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    LOG(WARNING) << "target triple not found in the module";
3831e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    target_triple = llvm::Triple("nvptx64-unknown-unknown");
3841e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
3851e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3861e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Figure out the exact name of the processor as known to the NVPTX backend
3871e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // from the gpu_architecture flag.
3887b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar  std::unique_ptr<llvm::TargetMachine> target_machine = GetTargetMachine(
3897b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar      target_triple, GetSmName(compute_capability), hlo_module_config);
3901e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  module_passes.add(llvm::createTargetTransformInfoWrapperPass(
3911e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      target_machine->getTargetIRAnalysis()));
3921e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3931e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // The LLVM IR verifier performs sanity checking on the IR. This helps
3941e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // discover problems and report them in a meaningful manner, rather than let
39553cb26d05a5c2080d8022124178b1cc43a30ffe5A. Unique TensorFlower  // later passes report obscure assertions because of unfulfilled invariants.
3961e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  module_passes.add(llvm::createVerifierPass());
3971e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3981e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Create the function-level pass manager. It needs data layout information
3991e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // too.
4001e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::legacy::FunctionPassManager function_passes(module);
4011e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
40212efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky  int32 opt_level =
40312efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky      hlo_module_config.debug_options().xla_backend_optimization_level();
40412efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky
405ac47dc166f290d631c156846039ac78f30f362afEli Bendersky  CHECK_GE(opt_level, 2)
406ac47dc166f290d631c156846039ac78f30f362afEli Bendersky      << "The XLA GPU backend doesn't support unoptimized code generation";
407ac47dc166f290d631c156846039ac78f30f362afEli Bendersky
40812efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky  AddOptimizationPasses(opt_level,
40912efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky                        /*size_level=*/0, target_machine.get(), &module_passes,
41012efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky                        &function_passes);
41112efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky
4121b5235fd897f7ea5cffc715300f67b4dc852fa27Jonathan Hseu  // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA
4131e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // again after the standard optimization passes [http://b/13329423].
414d57572e996dce24abf4d9cf6ea04e7104b3d743bMartin Wicke  // TODO(jingyue): SROA may further expose more optimization opportunities such
4151e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // as more precise alias analysis and more function inlining (SROA may change
4161e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // the inlining cost of a function). For now, running SROA already emits good
4171e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // enough code for the evaluated benchmarks. We may want to run more
4181e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // optimizations later.
41912efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky  if (opt_level > 0) {
4201e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    // LLVM's optimizer turns on SROA when the optimization level is greater
4211e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    // than 0. We mimic this behavior here.
4221e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    module_passes.add(llvm::createSROAPass());
4231e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
4241e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4251e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Verify that the module is well formed after optimizations ran.
4261e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  module_passes.add(llvm::createVerifierPass());
4271e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4281e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Done populating the pass managers. Now run them.
4291e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4301e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  function_passes.doInitialization();
4311e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  for (auto func = module->begin(); func != module->end(); ++func) {
4321e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    function_passes.run(*func);
4331e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
4341e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  function_passes.doFinalization();
4351e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  module_passes.run(*module);
4361e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4371e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Finally, produce PTX.
4381e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  return EmitModuleToPTX(module, target_machine.get());
4391e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
4401e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4411e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// One-time module initializer.
4421e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Must be called only once -- DO NOT CALL DIRECTLY.
4431e934ece7122cc623861a76ec3076f0dfb782225A. Unique TensorFlowervoid GPUBackendInit(const HloModuleConfig& hlo_module_config) {
4441e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Feed all customized flags here, so we can override them with llvm_cl_opts
4451e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // without redeploy the compiler for development purpose.
4461e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4471e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // This flag tunes a threshold in branch folding. The default threshold, which
4481e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // is one, is not suitable for CUDA programs where branches are more expensive
4491e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // than for CPU programs. Setting the threshold to 2 improves the latency of
4501e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // TwoDPatchDotProductKernel_IND_3_ND_48 by over 5%, and does not affect the
4511e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // latency of other benchmarks so far.
4521e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  //
4531e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // I also tried setting this threshold to other values:
4541e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // * 3-6 gives similar results as 2;
4551e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // * >6 start hurting the performance of at least dot product kernels.
4561e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  //
4571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // TODO(jingyue): The current threshold only considers the numbr of IR
4581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // instructions which do not accurately reflect the true cost. We need a
4591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // better cost model.
4601e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  FeedLLVMWithFlags({"-bonus-inst-threshold=2"});
4611e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // TODO(b/22073864): Increase limit when scan memory dependency.
4621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // This helps to reduce more redundant load instructions.
4631e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  //
4641e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // The specific value is currently large enough for s3d in shoc benchmark,
4651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // which contains a lot of load instructions and many arithmetic instructions
4661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // between those loads.
4671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  FeedLLVMWithFlags({"-memdep-block-scan-limit=500"});
4681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4691e934ece7122cc623861a76ec3076f0dfb782225A. Unique TensorFlower  llvm_ir::InitializeLLVMCommandLineOptions(hlo_module_config);
4701e934ece7122cc623861a76ec3076f0dfb782225A. Unique TensorFlower
4711e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Initialize the NVPTX target; it's the only target we link with, so call its
4721e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // specific initialization functions instead of the catch-all InitializeAll*.
4731e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  LLVMInitializeNVPTXTarget();
4741e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  LLVMInitializeNVPTXTargetInfo();
4751e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  LLVMInitializeNVPTXTargetMC();
4761e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  LLVMInitializeNVPTXAsmPrinter();
4771e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4781e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Initialize the LLVM optimization passes.
4791e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry();
4801e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  InitializePasses(registry);
4811e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
4821e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4831e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}  // namespace
4841e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4851e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsStatusOr<string> CompileToPtx(llvm::Module* module,
4867b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                              std::pair<int, int> compute_capability,
487d45505fe0c7ab9a10f16682f54d0eb54c4776cd1Justin Lebar                              const HloModuleConfig& hlo_module_config,
4881e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                              const string& libdevice_dir_path) {
4891e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  static std::once_flag backend_init_flag;
4901e934ece7122cc623861a76ec3076f0dfb782225A. Unique TensorFlower  std::call_once(backend_init_flag, GPUBackendInit, hlo_module_config);
4911e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4921e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  string ptx;
4931e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  {
4940bd46f52dad251846996bf440177128a16d429c2Artem Belevich    tensorflow::port::Tracing::TraceMe annotation(
4950bd46f52dad251846996bf440177128a16d429c2Artem Belevich        "Compiling IR", llvm_ir::AsString(module->getName()),
4960bd46f52dad251846996bf440177128a16d429c2Artem Belevich        /*is_expensive=*/true);
497b525ea6798175f4c95996a3666c70de5c00a9a0cJustin Lebar    XLA_SCOPED_LOGGING_TIMER("Compile module " +
498b525ea6798175f4c95996a3666c70de5c00a9a0cJustin Lebar                             llvm_ir::AsString(module->getName()));
499d45505fe0c7ab9a10f16682f54d0eb54c4776cd1Justin Lebar    TF_ASSIGN_OR_RETURN(
5007b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar        ptx, CompileModuleToPtx(module, compute_capability, hlo_module_config,
5017b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar                                libdevice_dir_path));
5021e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
5031e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  return ptx;
5041e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
5051e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
5061e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}  // namespace gpu
5071e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}  // namespace xla
508