gpu_backend_lib.cc revision 1e67c90e2caceeff82d09793d1ef5fa0300d219b
11e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
21e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
31e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsLicensed under the Apache License, Version 2.0 (the "License");
41e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsyou may not use this file except in compliance with the License.
51e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsYou may obtain a copy of the License at
61e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
71e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    http://www.apache.org/licenses/LICENSE-2.0
81e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
91e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsUnless required by applicable law or agreed to in writing, software
101e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsdistributed under the License is distributed on an "AS IS" BASIS,
111e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
121e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsSee the License for the specific language governing permissions and
131e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinslimitations under the License.
141e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins==============================================================================*/
151e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
161e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
171e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
181e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include <map>
191e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include <memory>
201e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include <string>
211e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include <utility>
221e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
231e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.h"
241e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/ptr_util.h"
251e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h"
261e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h"
271e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
281e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/util.h"
291e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
301e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/ADT/STLExtras.h"
311e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/ADT/StringMap.h"
321e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/ADT/StringSet.h"
331e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Analysis/TargetLibraryInfo.h"
341e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Analysis/TargetTransformInfo.h"
351e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Bitcode/BitcodeReader.h"
361e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Bitcode/BitcodeWriter.h"
371e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/CodeGen/CommandFlags.h"
381e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/IR/LLVMContext.h"
391e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/IR/LegacyPassManager.h"
401e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/IR/Module.h"
411e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/LinkAllIR.h"
421e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/LinkAllPasses.h"
431e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Linker/Linker.h"
441e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/PassRegistry.h"
451e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Support/CommandLine.h"
461e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Support/FileSystem.h"
471e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Support/FormattedStream.h"
481e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Support/TargetRegistry.h"
491e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Support/TargetSelect.h"
501e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Support/ToolOutputFile.h"
511e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Target/TargetMachine.h"
521e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Transforms/IPO.h"
531e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Transforms/IPO/AlwaysInliner.h"
541e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h"
551e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
561e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/types.h"
571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/lib/core/stringpiece.h"
581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/lib/io/path.h"
591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/lib/strings/str_util.h"
601e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/lib/strings/stringprintf.h"
611e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/platform/env.h"
621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/platform/logging.h"
631e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
641e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsnamespace xla {
651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsnamespace gpu {
661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsnamespace {
671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
// Inline threshold handed to LLVM's function inlining pass when full inlining
// is enabled.
constexpr int kDefaultInlineThreshold = 1100;
701e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
// Information about a GPU architecture for the backend.
struct GpuBackendInfo {
  // File name (not a full path) of the libdevice bitcode library to link for
  // this architecture.
  string libdevice_name;
  // "sm_XX" name associated with this architecture.
  // NOTE(review): presumably the CPU name handed to the NVPTX target; its use
  // is not visible in this part of the file -- confirm.
  string sm_name;
};
761e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
// Maps each supported CUDA compute capability to the backend information for
// that capability: the libdevice file to link and the matching SM name.
// Keyed by the value of the --gpu_architecture flag.
std::map<string, GpuBackendInfo> gpu_info_map = {
    {"compute_20", {"libdevice.compute_20.10.bc", "sm_20"}},
    {"compute_30", {"libdevice.compute_30.10.bc", "sm_30"}},
    {"compute_35", {"libdevice.compute_35.10.bc", "sm_35"}},

    // NVIDIA does not provide a separate libdevice for CC 3.7, but we can use
    // the one for 3.5.
    {"compute_37", {"libdevice.compute_35.10.bc", "sm_37"}},
};
881e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
891e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Validate the --gpu_architecture command-line flag.
901e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsstatic void ValidateGPUArchitecture(const string& value) {
911e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (!gpu_info_map.count(value)) {
921e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    LOG(FATAL) << "value for --gpu_architecture must be compute_{20,30,35,37}";
931e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
941e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
951e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
961e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Convenience function for producing a name of a temporary compilation product
971e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// from the input filename.
981e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsstring MakeNameForTempProduct(const std::string& input_filename,
991e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                              tensorflow::StringPiece extension) {
1001e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  legacy_flags::GpuBackendLibFlags* flags =
1011e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      legacy_flags::GetGpuBackendLibFlags();
1021e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  return tensorflow::io::JoinPath(
1031e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      flags->dump_temp_products_to,
1041e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      ReplaceFilenameExtension(
1051e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins          tensorflow::io::Basename(llvm_ir::AsString(input_filename)),
1061e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins          extension));
1071e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
1081e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
1091e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Initializes LLVM passes. Uses the PassRegistry mechanism.
1101e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsvoid InitializePasses(llvm::PassRegistry* pass_registry) {
1111e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeCore(*pass_registry);
1121e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeCodeGen(*pass_registry);
1131e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeScalarOpts(*pass_registry);
1141e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeObjCARCOpts(*pass_registry);
1151e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeVectorization(*pass_registry);
1161e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeIPO(*pass_registry);
1171e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeAnalysis(*pass_registry);
1181e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeTransformUtils(*pass_registry);
1191e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeInstCombine(*pass_registry);
1201e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeInstrumentation(*pass_registry);
1211e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeTarget(*pass_registry);
1221e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::initializeCodeGenPreparePass(*pass_registry);
1231e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
1241e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
1251e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Returns the TargetMachine, given a triple.
1261e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsstd::unique_ptr<llvm::TargetMachine> GetTargetMachine(
1271e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    llvm::Triple triple, tensorflow::StringPiece cpu_name) {
1281e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  std::string error;
1291e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  const llvm::Target* target = TargetRegistry::lookupTarget("", triple, error);
1301e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (target == nullptr) {
1311e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    LOG(FATAL) << "Unable to find Target for triple '" << triple.str() << "'"
1321e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins               << " -- " << error;
1331e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    return nullptr;
1341e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
1351e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
1361e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  TargetOptions target_options = InitTargetOptionsFromCodeGenFlags();
1371e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Enable FMA synthesis if desired.
1381e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  legacy_flags::GpuBackendLibFlags* flags =
1391e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      legacy_flags::GetGpuBackendLibFlags();
1401e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (flags->fma) {
1411e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    target_options.AllowFPOpFusion = FPOpFusion::Fast;
1421e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
1431e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
1441e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Set options from LlvmBackendFlags (specifically, fast-math flags).
1451e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm_ir::SetTargetOptions(&target_options);
1461e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
1471e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Set the verbose assembly options.
1481e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  target_options.MCOptions.AsmVerbose = flags->verbose_ptx_asm;
1491e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
1501e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // The selection of codegen optimization level is copied from function
1511e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // GetCodeGenOptLevel in //external/llvm/tools/opt/opt.cpp.
1521e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  CodeGenOpt::Level codegen_opt_level;
1531e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  switch (flags->opt_level) {
1541e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    case 1:
1551e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      codegen_opt_level = CodeGenOpt::Less;
1561e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      break;
1571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    case 2:
1581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      codegen_opt_level = CodeGenOpt::Default;
1591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      break;
1601e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    case 3:
1611e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      codegen_opt_level = CodeGenOpt::Aggressive;
1621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      break;
1631e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    default:
1641e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      codegen_opt_level = CodeGenOpt::None;
1651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
1661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  return WrapUnique(target->createTargetMachine(
1671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      triple.str(), llvm_ir::AsStringRef(cpu_name), "+ptx42", target_options,
1681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      Optional<Reloc::Model>(RelocModel), CMModel, codegen_opt_level));
1691e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
1701e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
1711e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Adds the standard LLVM optimization passes, based on the speed optimization
1721e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// level (opt_level) and size optimization level (size_level). Both module
1731e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// and function-level passes are added, so two pass managers are passed in and
1741e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// modified by this function.
1751e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsvoid AddOptimizationPasses(unsigned opt_level, unsigned size_level,
1761e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                           llvm::TargetMachine* target_machine,
1771e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                           llvm::legacy::PassManagerBase* module_passes,
1781e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                           llvm::legacy::FunctionPassManager* function_passes) {
1791e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  PassManagerBuilder builder;
1801e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  builder.OptLevel = opt_level;
1811e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  builder.SizeLevel = size_level;
1821e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
1831e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (opt_level > 1) {
1841e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    builder.Inliner = llvm::createFunctionInliningPass(kDefaultInlineThreshold);
1851e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  } else {
1861e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    // Only inline functions marked with "alwaysinline".
1871e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    builder.Inliner = llvm::createAlwaysInlinerLegacyPass();
1881e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
1891e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
1901e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  builder.DisableUnitAtATime = false;
1911e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  builder.DisableUnrollLoops = opt_level == 0;
1921e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  builder.LoopVectorize = opt_level > 0;
1931e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  builder.SLPVectorize = opt_level > 1 && size_level < 2;
1941e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
1951e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // NVPTX's early-as-possible passes include NVVM reflect.
1961e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  builder.addExtension(
1971e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      llvm::PassManagerBuilder::EP_EarlyAsPossible,
1981e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      [&](const PassManagerBuilder&, legacy::PassManagerBase& pass_manager) {
1991e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins        target_machine->addEarlyAsPossiblePasses(pass_manager);
2001e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      });
2011e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2021e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  builder.populateFunctionPassManager(*function_passes);
2031e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  builder.populateModulePassManager(*module_passes);
2041e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
2051e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2061e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Emits the given module to a bit code file.
2071e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsvoid EmitBitcodeToFile(const Module& module, tensorflow::StringPiece filename) {
2081e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  std::error_code error_code;
2091e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::tool_output_file outfile(filename.ToString().c_str(), error_code,
2101e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                                 llvm::sys::fs::F_None);
2111e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (error_code) {
2121e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    LOG(FATAL) << "opening bitcode file for writing: " << error_code.message();
2131e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
2141e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2151e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::WriteBitcodeToFile(&module, outfile.os());
2161e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  outfile.keep();
2171e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
2181e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2191e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Emits the given module to PTX. target_machine is an initialized TargetMachine
2201e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// for the NVPTX target.
2211e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsstring EmitModuleToPTX(Module* module, llvm::TargetMachine* target_machine) {
2221e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  std::string ptx;  // need a std::string instead of a ::string.
2231e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  {
2241e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    llvm::raw_string_ostream stream(ptx);
2251e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    llvm::buffer_ostream pstream(stream);
2261e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    // The extension is stripped by IrDumpingPassManager, so we need to
2271e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    // get creative to add a suffix.
2281e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    string module_id(llvm_ir::AsString(module->getModuleIdentifier()));
2291e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    legacy_flags::GpuBackendLibFlags* flags =
2301e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins        legacy_flags::GetGpuBackendLibFlags();
2311e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    IrDumpingPassManager codegen_passes(
2321e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins        ReplaceFilenameExtension(tensorflow::io::Basename(module_id),
2331e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                                 "-nvptx.dummy"),
2341e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins        flags->dump_temp_products_to, flags->dump_ir_before_passes);
2351e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass(
2361e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins        llvm::Triple(module->getTargetTriple())));
2371e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2381e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    target_machine->addPassesToEmitFile(codegen_passes, pstream,
2391e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                                        llvm::TargetMachine::CGFT_AssemblyFile);
2401e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    codegen_passes.run(*module);
2411e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
2421e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2431e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  return ptx;
2441e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
2451e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2461e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// LLVM has an extensive flags mechanism of its own, which is only accessible
2471e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// through the command line. Internal libraries within LLVM register parsers for
2481e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// flags, with no other way to configure them except pass these flags.
2491e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// To do this programmatically, we invoke ParseCommandLineOptions manually with
2501e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// a "fake argv".
2511e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Note: setting flags with this method is stateful, since flags are just
2521e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// static globals within LLVM libraries.
2531e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsvoid FeedLLVMWithFlags(const std::vector<string>& cl_opts) {
2541e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  std::vector<const char*> fake_argv = {""};
2551e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  for (const string& cl_opt : cl_opts) {
2561e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    fake_argv.push_back(cl_opt.c_str());
2571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
2581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]);
2591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
2601e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2611e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsnamespace {
2621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Returns whether the module could use any libdevice functions. This function
2631e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// may have false positives -- the module might not use libdevice even if this
2641e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// function returns true.
2651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsbool CouldNeedLibdevice(const llvm::Module& module) {
2661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  for (const llvm::Function& function : module.functions()) {
2671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    // This is a conservative approximation -- not all such functions are in
2681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    // libdevice.
2691e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    if (!function.isIntrinsic() && function.isDeclaration()) {
2701e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      return true;
2711e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    }
2721e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
2731e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  return false;
2741e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
2751e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2761e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Links libdevice into the given module if the module needs libdevice.
2771e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinstensorflow::Status LinkLibdeviceIfNecessary(const string& libdevice_dir_path,
2781e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                                            llvm::Module* module) {
2791e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (!CouldNeedLibdevice(*module)) {
2801e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    return tensorflow::Status::OK();
2811e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
2821e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
2831e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::Linker linker(*module);
2841e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  legacy_flags::GpuBackendLibFlags* flags =
2851e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      legacy_flags::GetGpuBackendLibFlags();
2861e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  ValidateGPUArchitecture(flags->gpu_architecture);
2871e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  string libdevice_bc_filename =
2881e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      gpu_info_map[flags->gpu_architecture].libdevice_name;
2891e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  string libdevice_bc_fullpath =
2901e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      tensorflow::io::JoinPath(libdevice_dir_path, libdevice_bc_filename);
2911e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  TF_RETURN_IF_ERROR(
2921e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      tensorflow::Env::Default()->FileExists(libdevice_bc_fullpath));
2931e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  std::unique_ptr<llvm::Module> libdevice_module =
2941e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      LoadIRModule(libdevice_bc_fullpath, &module->getContext());
2951e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  VLOG(1) << "Linking with libdevice from: " << libdevice_bc_fullpath;
2961e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (linker.linkInModule(std::move(libdevice_module),
2971e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                          llvm::Linker::Flags::InternalizeLinkedSymbols |
2981e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                              llvm::Linker::Flags::LinkOnlyNeeded)) {
2991e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    LOG(FATAL) << "Error linking libdevice from " << libdevice_bc_fullpath;
3001e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
3011e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  return tensorflow::Status::OK();
3021e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
3031e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3041e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}  // namespace
3051e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3061e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsStatusOr<string> CompileModuleToPtx(llvm::Module* module,
3071e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                                    const string& libdevice_dir_path) {
3081e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Link the input module with libdevice, to pull in implementations of some
3091e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // builtins.
3101e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  TF_RETURN_IF_ERROR(LinkLibdeviceIfNecessary(libdevice_dir_path, module));
3111e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3121e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  legacy_flags::GpuBackendLibFlags* flags =
3131e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      legacy_flags::GetGpuBackendLibFlags();
3141e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (!flags->dump_temp_products_to.empty()) {
3151e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    string linked_filename =
3161e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins        MakeNameForTempProduct(module->getModuleIdentifier(), "linked.bc");
3171e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    LOG(INFO) << "dumping bitcode after linking libdevice to: "
3181e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins              << linked_filename;
3191e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    EmitBitcodeToFile(*module, linked_filename);
3201e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
3211e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3221e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Set the flush-denormals-to-zero flag on the module so the NVVM reflect pass
3231e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // can access it.
3241e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", flags->ftz);
3251e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3261e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // If ftz is enabled, set it as an attribute on every function in the module.
3271e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (flags->ftz) {
3281e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    for (llvm::Function& fn : *module) {
3291e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      fn.addFnAttr("nvptx-f32ftz", "true");
3301e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    }
3311e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
3321e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3331e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Run IR-level optimizations.
3341e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (flags->dump_ir_before_passes && flags->dump_temp_products_to.empty()) {
3351e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    LOG(FATAL) << "--dump_ir_before_passes must be specified with "
3361e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                  "--dump_temp_products_to";
3371e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
3381e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3391e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  IrDumpingPassManager module_passes(module->getModuleIdentifier(),
3401e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                                     flags->dump_temp_products_to,
3411e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                                     flags->dump_ir_before_passes);
3421e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3431e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Add an appropriate TargetLibraryInfo pass for the module's triple.
3441e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::TargetLibraryInfoWrapperPass* tliwp =
3451e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      new llvm::TargetLibraryInfoWrapperPass(
3461e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins          llvm::Triple(module->getTargetTriple()));
3471e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  module_passes.add(tliwp);
3481e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3491e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Try to fetch the target triple from the module. If not present, set a
3501e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // default target triple.
3511e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::Triple target_triple = llvm::Triple(module->getTargetTriple());
3521e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (target_triple.getArch() == llvm::Triple::UnknownArch) {
3531e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    LOG(WARNING) << "target triple not found in the module";
3541e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    target_triple = llvm::Triple("nvptx64-unknown-unknown");
3551e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
3561e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Figure out the exact name of the processor as known to the NVPTX backend
3581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // from the gpu_architecture flag.
3591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  ValidateGPUArchitecture(flags->gpu_architecture);
3601e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  string cpu_name = gpu_info_map[flags->gpu_architecture].sm_name;
3611e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  std::unique_ptr<llvm::TargetMachine> target_machine =
3631e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      GetTargetMachine(target_triple, cpu_name);
3641e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  module_passes.add(llvm::createTargetTransformInfoWrapperPass(
3651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      target_machine->getTargetIRAnalysis()));
3661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // The LLVM IR verifier performs sanity checking on the IR. This helps
3681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // discover problems and report them in a meaningful manner, rather than let
3691e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // later passes report obscure assertions because of unfulfilled invariants.
3701e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  module_passes.add(llvm::createVerifierPass());
3711e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3721e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Create the function-level pass manager. It needs data layout information
3731e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // too.
3741e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::legacy::FunctionPassManager function_passes(module);
3751e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3761e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  AddOptimizationPasses(flags->opt_level, /*size_level=*/0,
3771e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                        target_machine.get(), &module_passes, &function_passes);
3781e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA
3791e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // again after the standard optimization passes [http://b/13329423].
3801e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // TODO(jingyue): SROA may further expose more optimization opportunities, such
3811e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // as more precise alias analysis and more function inlining (SROA may change
3821e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // the inlining cost of a function). For now, running SROA already emits good
3831e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // enough code for the evaluated benchmarks. We may want to run more
3841e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // optimizations later.
3851e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (flags->opt_level > 0) {
3861e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    // LLVM's optimizer turns on SROA when the optimization level is greater
3871e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    // than 0. We mimic this behavior here.
3881e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    module_passes.add(llvm::createSROAPass());
3891e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
3901e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3911e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Verify that the module is well formed after optimizations ran.
3921e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  module_passes.add(llvm::createVerifierPass());
3931e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3941e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Done populating the pass managers. Now run them.
3951e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
3961e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  function_passes.doInitialization();
3971e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  for (auto func = module->begin(); func != module->end(); ++func) {
3981e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    function_passes.run(*func);
3991e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
4001e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  function_passes.doFinalization();
4011e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  module_passes.run(*module);
4021e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4031e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (!flags->dump_temp_products_to.empty()) {
4041e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    string optimized_filename =
4051e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins        MakeNameForTempProduct(module->getModuleIdentifier(), "optimized.bc");
4061e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    LOG(INFO) << "dumping bitcode after optimizations to: "
4071e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins              << optimized_filename;
4081e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    EmitBitcodeToFile(*module, optimized_filename);
4091e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
4101e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4111e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Finally, produce PTX.
4121e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  return EmitModuleToPTX(module, target_machine.get());
4131e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
4141e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4151e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// One-time module initializer.
4161e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Must be called only once -- DO NOT CALL DIRECTLY.
4171e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsvoid GPUBackendInit() {
4181e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Feed all customized flags here, so we can override them with llvm_cl_opts
4191e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // without redeploy the compiler for development purpose.
4201e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4211e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // This flag tunes a threshold in branch folding. The default threshold, which
4221e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // is one, is not suitable for CUDA programs where branches are more expensive
4231e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // than for CPU programs. Setting the threshold to 2 improves the latency of
4241e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // TwoDPatchDotProductKernel_IND_3_ND_48 by over 5%, and does not affect the
4251e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // latency of other benchmarks so far.
4261e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  //
4271e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // I also tried setting this threshold to other values:
4281e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // * 3-6 gives similar results as 2;
4291e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // * >6 start hurting the performance of at least dot product kernels.
4301e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  //
4311e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // TODO(jingyue): The current threshold only considers the numbr of IR
4321e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // instructions which do not accurately reflect the true cost. We need a
4331e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // better cost model.
4341e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  FeedLLVMWithFlags({"-bonus-inst-threshold=2"});
4351e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // TODO(b/22073864): Increase limit when scan memory dependency.
4361e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // This helps to reduce more redundant load instructions.
4371e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  //
4381e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // The specific value is currently large enough for s3d in shoc benchmark,
4391e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // which contains a lot of load instructions and many arithmetic instructions
4401e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // between those loads.
4411e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  FeedLLVMWithFlags({"-memdep-block-scan-limit=500"});
4421e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4431e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  legacy_flags::GpuBackendLibFlags* flags =
4441e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins      legacy_flags::GetGpuBackendLibFlags();
4451e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (!flags->llvm_cl_opts.empty()) {
4461e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    std::vector<string> opts =
4471e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins        tensorflow::str_util::Split(flags->llvm_cl_opts, ',');
4481e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    FeedLLVMWithFlags(opts);
4491e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
4501e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4511e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  if (flags->llvm_dump_passes) {
4521e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    // Enable LLVM pass debugging dump. LLVM dumps this information when a pass
4531e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    // manager is initialized for execution. It's done to stderr (this is
4541e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    // hardcoded within LLVM to the dbgs() stream, we can't change it from the
4551e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    // outside).
4561e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    FeedLLVMWithFlags({"-debug-pass=Arguments"});
4571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
4581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Initialize the NVPTX target; it's the only target we link with, so call its
4601e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // specific initialization functions instead of the catch-all InitializeAll*.
4611e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  LLVMInitializeNVPTXTarget();
4621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  LLVMInitializeNVPTXTargetInfo();
4631e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  LLVMInitializeNVPTXTargetMC();
4641e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  LLVMInitializeNVPTXAsmPrinter();
4651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  // Initialize the LLVM optimization passes.
4671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry();
4681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  InitializePasses(registry);
4691e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
4701e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4711e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}  // namespace
4721e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4731e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsStatusOr<string> CompileToPtx(llvm::Module* module,
4741e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins                              const string& libdevice_dir_path) {
4751e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  static std::once_flag backend_init_flag;
4761e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  std::call_once(backend_init_flag, GPUBackendInit);
4771e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4781e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  string ptx;
4791e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  {
4801e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    ScopedLoggingTimer compilation_timer(
4811e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins        "Compile module " + llvm_ir::AsString(module->getName()),
4821e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins        /*vlog_level=*/2);
4831e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins    TF_ASSIGN_OR_RETURN(ptx, CompileModuleToPtx(module, libdevice_dir_path));
4841e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  }
4851e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins  return ptx;
4861e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}
4871e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins
4881e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}  // namespace gpu
4891e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}  // namespace xla
490