gpu_backend_lib.cc revision 1e67c90e2caceeff82d09793d1ef5fa0300d219b
11e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 21e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 31e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsLicensed under the Apache License, Version 2.0 (the "License"); 41e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsyou may not use this file except in compliance with the License. 51e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsYou may obtain a copy of the License at 61e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 71e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins http://www.apache.org/licenses/LICENSE-2.0 81e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 91e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsUnless required by applicable law or agreed to in writing, software 101e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsdistributed under the License is distributed on an "AS IS" BASIS, 111e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 121e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsSee the License for the specific language governing permissions and 131e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinslimitations under the License. 141e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins==============================================================================*/ 151e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 161e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" 171e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 181e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include <map> 191e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include <memory> 201e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include <string> 211e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include <utility> 221e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 231e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.h" 241e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/ptr_util.h" 251e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h" 261e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h" 271e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" 281e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/util.h" 291e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 301e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/ADT/STLExtras.h" 311e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/ADT/StringMap.h" 321e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/ADT/StringSet.h" 331e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Analysis/TargetLibraryInfo.h" 341e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Analysis/TargetTransformInfo.h" 351e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Bitcode/BitcodeReader.h" 361e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Bitcode/BitcodeWriter.h" 371e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/CodeGen/CommandFlags.h" 381e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/IR/LLVMContext.h" 391e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/IR/LegacyPassManager.h" 401e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/IR/Module.h" 411e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/LinkAllIR.h" 421e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/LinkAllPasses.h" 431e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Linker/Linker.h" 441e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/PassRegistry.h" 451e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Support/CommandLine.h" 461e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Support/FileSystem.h" 471e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Support/FormattedStream.h" 481e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Support/TargetRegistry.h" 491e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Support/TargetSelect.h" 501e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Support/ToolOutputFile.h" 511e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Target/TargetMachine.h" 521e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Transforms/IPO.h" 531e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Transforms/IPO/AlwaysInliner.h" 541e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "external/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h" 551e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 561e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/types.h" 571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/lib/core/stringpiece.h" 581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/lib/io/path.h" 591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/lib/strings/str_util.h" 601e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/lib/strings/stringprintf.h" 611e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/platform/env.h" 621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/platform/logging.h" 631e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 641e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsnamespace xla { 651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsnamespace gpu { 661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsnamespace { 671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Default inline threshold value to use in llvm. 691e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsconst int kDefaultInlineThreshold = 1100; 701e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 711e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Information about a GPU architecture for the backend. 721e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsstruct GpuBackendInfo { 731e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins string libdevice_name; 741e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins string sm_name; 751e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}; 761e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 771e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Maps supported CUDA compute capability to a libdevice file to link for this 781e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// capability. 791e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsstd::map<string, GpuBackendInfo> gpu_info_map = { 801e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins {"compute_20", {"libdevice.compute_20.10.bc", "sm_20"}}, 811e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins {"compute_30", {"libdevice.compute_30.10.bc", "sm_30"}}, 821e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins {"compute_35", {"libdevice.compute_35.10.bc", "sm_35"}}, 831e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 841e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // NVIDIA does not provide a separate libdevice for CC 3.7, but we can use 851e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // the one for 3.5. 861e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins {"compute_37", {"libdevice.compute_35.10.bc", "sm_37"}}, 871e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins}; 881e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 891e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Validate the --gpu_architecture command-line flag. 901e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsstatic void ValidateGPUArchitecture(const string& value) { 911e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (!gpu_info_map.count(value)) { 921e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LOG(FATAL) << "value for --gpu_architecture must be compute_{20,30,35,37}"; 931e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 941e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 951e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 961e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Convenience function for producing a name of a temporary compilation product 971e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// from the input filename. 981e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsstring MakeNameForTempProduct(const std::string& input_filename, 991e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins tensorflow::StringPiece extension) { 1001e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins legacy_flags::GpuBackendLibFlags* flags = 1011e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins legacy_flags::GetGpuBackendLibFlags(); 1021e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return tensorflow::io::JoinPath( 1031e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins flags->dump_temp_products_to, 1041e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins ReplaceFilenameExtension( 1051e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins tensorflow::io::Basename(llvm_ir::AsString(input_filename)), 1061e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins extension)); 1071e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 1081e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 1091e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Initializes LLVM passes. Uses the PassRegistry mechanism. 1101e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsvoid InitializePasses(llvm::PassRegistry* pass_registry) { 1111e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeCore(*pass_registry); 1121e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeCodeGen(*pass_registry); 1131e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeScalarOpts(*pass_registry); 1141e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeObjCARCOpts(*pass_registry); 1151e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeVectorization(*pass_registry); 1161e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeIPO(*pass_registry); 1171e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeAnalysis(*pass_registry); 1181e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeTransformUtils(*pass_registry); 1191e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeInstCombine(*pass_registry); 1201e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeInstrumentation(*pass_registry); 1211e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeTarget(*pass_registry); 1221e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeCodeGenPreparePass(*pass_registry); 1231e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 1241e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 1251e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Returns the TargetMachine, given a triple. 1261e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsstd::unique_ptr<llvm::TargetMachine> GetTargetMachine( 1271e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::Triple triple, tensorflow::StringPiece cpu_name) { 1281e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins std::string error; 1291e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins const llvm::Target* target = TargetRegistry::lookupTarget("", triple, error); 1301e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (target == nullptr) { 1311e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LOG(FATAL) << "Unable to find Target for triple '" << triple.str() << "'" 1321e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins << " -- " << error; 1331e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return nullptr; 1341e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 1351e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 1361e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins TargetOptions target_options = InitTargetOptionsFromCodeGenFlags(); 1371e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Enable FMA synthesis if desired. 1381e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins legacy_flags::GpuBackendLibFlags* flags = 1391e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins legacy_flags::GetGpuBackendLibFlags(); 1401e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (flags->fma) { 1411e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins target_options.AllowFPOpFusion = FPOpFusion::Fast; 1421e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 1431e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 1441e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Set options from LlvmBackendFlags (specifically, fast-math flags). 1451e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm_ir::SetTargetOptions(&target_options); 1461e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 1471e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Set the verbose assembly options. 1481e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins target_options.MCOptions.AsmVerbose = flags->verbose_ptx_asm; 1491e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 1501e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // The selection of codegen optimization level is copied from function 1511e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // GetCodeGenOptLevel in //external/llvm/tools/opt/opt.cpp. 1521e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins CodeGenOpt::Level codegen_opt_level; 1531e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins switch (flags->opt_level) { 1541e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins case 1: 1551e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins codegen_opt_level = CodeGenOpt::Less; 1561e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins break; 1571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins case 2: 1581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins codegen_opt_level = CodeGenOpt::Default; 1591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins break; 1601e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins case 3: 1611e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins codegen_opt_level = CodeGenOpt::Aggressive; 1621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins break; 1631e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins default: 1641e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins codegen_opt_level = CodeGenOpt::None; 1651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 1661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return WrapUnique(target->createTargetMachine( 1671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins triple.str(), llvm_ir::AsStringRef(cpu_name), "+ptx42", target_options, 1681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins Optional<Reloc::Model>(RelocModel), CMModel, codegen_opt_level)); 1691e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 1701e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 1711e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Adds the standard LLVM optimization passes, based on the speed optimization 1721e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// level (opt_level) and size optimization level (size_level). Both module 1731e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// and function-level passes are added, so two pass managers are passed in and 1741e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// modified by this function. 1751e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsvoid AddOptimizationPasses(unsigned opt_level, unsigned size_level, 1761e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::TargetMachine* target_machine, 1771e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::legacy::PassManagerBase* module_passes, 1781e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::legacy::FunctionPassManager* function_passes) { 1791e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins PassManagerBuilder builder; 1801e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.OptLevel = opt_level; 1811e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.SizeLevel = size_level; 1821e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 1831e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (opt_level > 1) { 1841e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.Inliner = llvm::createFunctionInliningPass(kDefaultInlineThreshold); 1851e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } else { 1861e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Only inline functions marked with "alwaysinline". 1871e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.Inliner = llvm::createAlwaysInlinerLegacyPass(); 1881e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 1891e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 1901e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.DisableUnitAtATime = false; 1911e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.DisableUnrollLoops = opt_level == 0; 1921e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.LoopVectorize = opt_level > 0; 1931e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.SLPVectorize = opt_level > 1 && size_level < 2; 1941e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 1951e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // NVPTX's early-as-possible passes include NVVM reflect. 1961e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.addExtension( 1971e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::PassManagerBuilder::EP_EarlyAsPossible, 1981e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins [&](const PassManagerBuilder&, legacy::PassManagerBase& pass_manager) { 1991e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins target_machine->addEarlyAsPossiblePasses(pass_manager); 2001e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins }); 2011e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2021e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.populateFunctionPassManager(*function_passes); 2031e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.populateModulePassManager(*module_passes); 2041e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 2051e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2061e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Emits the given module to a bit code file. 2071e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsvoid EmitBitcodeToFile(const Module& module, tensorflow::StringPiece filename) { 2081e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins std::error_code error_code; 2091e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::tool_output_file outfile(filename.ToString().c_str(), error_code, 2101e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::sys::fs::F_None); 2111e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (error_code) { 2121e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LOG(FATAL) << "opening bitcode file for writing: " << error_code.message(); 2131e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 2141e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2151e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::WriteBitcodeToFile(&module, outfile.os()); 2161e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins outfile.keep(); 2171e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 2181e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2191e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Emits the given module to PTX. target_machine is an initialized TargetMachine 2201e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// for the NVPTX target. 2211e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsstring EmitModuleToPTX(Module* module, llvm::TargetMachine* target_machine) { 2221e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins std::string ptx; // need a std::string instead of a ::string. 2231e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins { 2241e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::raw_string_ostream stream(ptx); 2251e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::buffer_ostream pstream(stream); 2261e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // The extension is stripped by IrDumpingPassManager, so we need to 2271e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // get creative to add a suffix. 2281e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins string module_id(llvm_ir::AsString(module->getModuleIdentifier())); 2291e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins legacy_flags::GpuBackendLibFlags* flags = 2301e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins legacy_flags::GetGpuBackendLibFlags(); 2311e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins IrDumpingPassManager codegen_passes( 2321e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins ReplaceFilenameExtension(tensorflow::io::Basename(module_id), 2331e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins "-nvptx.dummy"), 2341e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins flags->dump_temp_products_to, flags->dump_ir_before_passes); 2351e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass( 2361e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::Triple(module->getTargetTriple()))); 2371e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2381e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins target_machine->addPassesToEmitFile(codegen_passes, pstream, 2391e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::TargetMachine::CGFT_AssemblyFile); 2401e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins codegen_passes.run(*module); 2411e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 2421e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2431e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return ptx; 2441e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 2451e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2461e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// LLVM has an extensive flags mechanism of its own, which is only accessible 2471e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// through the command line. Internal libraries within LLVM register parsers for 2481e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// flags, with no other way to configure them except pass these flags. 2491e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// To do this programmatically, we invoke ParseCommandLineOptions manually with 2501e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// a "fake argv". 2511e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Note: setting flags with this method is stateful, since flags are just 2521e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// static globals within LLVM libraries. 2531e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsvoid FeedLLVMWithFlags(const std::vector<string>& cl_opts) { 2541e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins std::vector<const char*> fake_argv = {""}; 2551e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins for (const string& cl_opt : cl_opts) { 2561e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins fake_argv.push_back(cl_opt.c_str()); 2571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 2581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]); 2591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 2601e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2611e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsnamespace { 2621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Returns whether the module could use any libdevice functions. This function 2631e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// may have false positives -- the module might not use libdevice even if this 2641e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// function returns true. 2651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsbool CouldNeedLibdevice(const llvm::Module& module) { 2661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins for (const llvm::Function& function : module.functions()) { 2671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // This is a conservative approximation -- not all such functions are in 2681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // libdevice. 2691e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (!function.isIntrinsic() && function.isDeclaration()) { 2701e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return true; 2711e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 2721e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 2731e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return false; 2741e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 2751e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2761e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Links libdevice into the given module if the module needs libdevice. 2771e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinstensorflow::Status LinkLibdeviceIfNecessary(const string& libdevice_dir_path, 2781e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::Module* module) { 2791e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (!CouldNeedLibdevice(*module)) { 2801e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return tensorflow::Status::OK(); 2811e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 2821e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2831e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::Linker linker(*module); 2841e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins legacy_flags::GpuBackendLibFlags* flags = 2851e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins legacy_flags::GetGpuBackendLibFlags(); 2861e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins ValidateGPUArchitecture(flags->gpu_architecture); 2871e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins string libdevice_bc_filename = 2881e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins gpu_info_map[flags->gpu_architecture].libdevice_name; 2891e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins string libdevice_bc_fullpath = 2901e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins tensorflow::io::JoinPath(libdevice_dir_path, libdevice_bc_filename); 2911e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins TF_RETURN_IF_ERROR( 2921e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins tensorflow::Env::Default()->FileExists(libdevice_bc_fullpath)); 2931e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins std::unique_ptr<llvm::Module> libdevice_module = 2941e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LoadIRModule(libdevice_bc_fullpath, &module->getContext()); 2951e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins VLOG(1) << "Linking with libdevice from: " << libdevice_bc_fullpath; 2961e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (linker.linkInModule(std::move(libdevice_module), 2971e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::Linker::Flags::InternalizeLinkedSymbols | 2981e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::Linker::Flags::LinkOnlyNeeded)) { 2991e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LOG(FATAL) << "Error linking libdevice from " << libdevice_bc_fullpath; 3001e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 3011e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return tensorflow::Status::OK(); 3021e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 3031e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3041e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} // namespace 3051e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3061e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsStatusOr<string> CompileModuleToPtx(llvm::Module* module, 3071e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins const string& libdevice_dir_path) { 3081e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Link the input module with libdevice, to pull in implementations of some 3091e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // builtins. 3101e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins TF_RETURN_IF_ERROR(LinkLibdeviceIfNecessary(libdevice_dir_path, module)); 3111e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3121e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins legacy_flags::GpuBackendLibFlags* flags = 3131e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins legacy_flags::GetGpuBackendLibFlags(); 3141e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (!flags->dump_temp_products_to.empty()) { 3151e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins string linked_filename = 3161e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins MakeNameForTempProduct(module->getModuleIdentifier(), "linked.bc"); 3171e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LOG(INFO) << "dumping bitcode after linking libdevice to: " 3181e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins << linked_filename; 3191e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins EmitBitcodeToFile(*module, linked_filename); 3201e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 3211e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3221e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Set the flush-denormals-to-zero flag on the module so the NVVM reflect pass 3231e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // can access it. 3241e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", flags->ftz); 3251e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3261e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // If ftz is enabled, set it as an attribute on every function in the module. 3271e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (flags->ftz) { 3281e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins for (llvm::Function& fn : *module) { 3291e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins fn.addFnAttr("nvptx-f32ftz", "true"); 3301e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 3311e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 3321e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3331e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Run IR-level optimizations. 3341e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (flags->dump_ir_before_passes && flags->dump_temp_products_to.empty()) { 3351e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LOG(FATAL) << "--dump_ir_before_passes must be specified with " 3361e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins "--dump_temp_products_to"; 3371e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 3381e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3391e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins IrDumpingPassManager module_passes(module->getModuleIdentifier(), 3401e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins flags->dump_temp_products_to, 3411e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins flags->dump_ir_before_passes); 3421e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3431e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Add an appropriate TargetLibraryInfo pass for the module's triple. 3441e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::TargetLibraryInfoWrapperPass* tliwp = 3451e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins new llvm::TargetLibraryInfoWrapperPass( 3461e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::Triple(module->getTargetTriple())); 3471e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins module_passes.add(tliwp); 3481e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3491e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Try to fetch the target triple from the module. If not present, set a 3501e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // default target triple. 3511e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::Triple target_triple = llvm::Triple(module->getTargetTriple()); 3521e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (target_triple.getArch() == llvm::Triple::UnknownArch) { 3531e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LOG(WARNING) << "target triple not found in the module"; 3541e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins target_triple = llvm::Triple("nvptx64-unknown-unknown"); 3551e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 3561e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Figure out the exact name of the processor as known to the NVPTX backend 3581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // from the gpu_architecture flag. 3591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins ValidateGPUArchitecture(flags->gpu_architecture); 3601e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins string cpu_name = gpu_info_map[flags->gpu_architecture].sm_name; 3611e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins std::unique_ptr<llvm::TargetMachine> target_machine = 3631e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins GetTargetMachine(target_triple, cpu_name); 3641e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins module_passes.add(llvm::createTargetTransformInfoWrapperPass( 3651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins target_machine->getTargetIRAnalysis())); 3661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // The LLVM IR verifier performs sanity checking on the IR. This helps 3681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // discover problems and report them in a meaningful manner, rather than let 3691e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // later passes report obscure assertions becasue of unfulfilled invariants. 3701e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins module_passes.add(llvm::createVerifierPass()); 3711e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3721e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Create the function-level pass manager. It needs data layout information 3731e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // too. 3741e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::legacy::FunctionPassManager function_passes(module); 3751e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3761e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins AddOptimizationPasses(flags->opt_level, /*size_level=*/0, 3771e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins target_machine.get(), &module_passes, &function_passes); 3781e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Loop unrolling exposes more opportunites for SROA. Therefore, we run SROA 3791e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // again after the standard optimization passes [http://b/13329423]. 3801e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // TODO(jingyue): SROA may further expose more optimization opportunites, such 3811e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // as more precise alias analysis and more function inlining (SROA may change 3821e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // the inlining cost of a function). For now, running SROA already emits good 3831e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // enough code for the evaluated benchmarks. We may want to run more 3841e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // optimizations later. 3851e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (flags->opt_level > 0) { 3861e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // LLVM's optimizer turns on SROA when the optimization level is greater 3871e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // than 0. We mimic this behavior here. 3881e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins module_passes.add(llvm::createSROAPass()); 3891e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 3901e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3911e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Verify that the module is well formed after optimizations ran. 3921e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins module_passes.add(llvm::createVerifierPass()); 3931e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3941e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Done populating the pass managers. Now run them. 3951e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3961e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins function_passes.doInitialization(); 3971e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins for (auto func = module->begin(); func != module->end(); ++func) { 3981e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins function_passes.run(*func); 3991e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 4001e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins function_passes.doFinalization(); 4011e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins module_passes.run(*module); 4021e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4031e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (!flags->dump_temp_products_to.empty()) { 4041e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins string optimized_filename = 4051e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins MakeNameForTempProduct(module->getModuleIdentifier(), "optimized.bc"); 4061e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LOG(INFO) << "dumping bitcode after optimizations to: " 4071e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins << optimized_filename; 4081e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins EmitBitcodeToFile(*module, optimized_filename); 4091e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 4101e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4111e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Finally, produce PTX. 4121e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return EmitModuleToPTX(module, target_machine.get()); 4131e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 4141e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4151e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// One-time module initializer. 4161e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Must be called only once -- DO NOT CALL DIRECTLY. 4171e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsvoid GPUBackendInit() { 4181e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Feed all customized flags here, so we can override them with llvm_cl_opts 4191e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // without redeploy the compiler for development purpose. 4201e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4211e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // This flag tunes a threshold in branch folding. The default threshold, which 4221e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // is one, is not suitable for CUDA programs where branches are more expensive 4231e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // than for CPU programs. Setting the threshold to 2 improves the latency of 4241e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // TwoDPatchDotProductKernel_IND_3_ND_48 by over 5%, and does not affect the 4251e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // latency of other benchmarks so far. 4261e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // 4271e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // I also tried setting this threshold to other values: 4281e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // * 3-6 gives similar results as 2; 4291e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // * >6 start hurting the performance of at least dot product kernels. 4301e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // 4311e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // TODO(jingyue): The current threshold only considers the numbr of IR 4321e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // instructions which do not accurately reflect the true cost. We need a 4331e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // better cost model. 4341e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins FeedLLVMWithFlags({"-bonus-inst-threshold=2"}); 4351e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // TODO(b/22073864): Increase limit when scan memory dependency. 4361e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // This helps to reduce more redundant load instructions. 4371e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // 4381e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // The specific value is currently large enough for s3d in shoc benchmark, 4391e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // which contains a lot of load instructions and many arithmetic instructions 4401e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // between those loads. 4411e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins FeedLLVMWithFlags({"-memdep-block-scan-limit=500"}); 4421e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4431e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins legacy_flags::GpuBackendLibFlags* flags = 4441e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins legacy_flags::GetGpuBackendLibFlags(); 4451e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (!flags->llvm_cl_opts.empty()) { 4461e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins std::vector<string> opts = 4471e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins tensorflow::str_util::Split(flags->llvm_cl_opts, ','); 4481e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins FeedLLVMWithFlags(opts); 4491e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 4501e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4511e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (flags->llvm_dump_passes) { 4521e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Enable LLVM pass debugging dump. LLVM dumps this information when a pass 4531e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // manager is initialized for execution. It's done to stderr (this is 4541e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // hardcoded within LLVM to the dbgs() stream, we can't change it from the 4551e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // outside). 4561e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins FeedLLVMWithFlags({"-debug-pass=Arguments"}); 4571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 4581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Initialize the NVPTX target; it's the only target we link with, so call its 4601e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // specific initialization functions instead of the catch-all InitializeAll*. 4611e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LLVMInitializeNVPTXTarget(); 4621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LLVMInitializeNVPTXTargetInfo(); 4631e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LLVMInitializeNVPTXTargetMC(); 4641e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LLVMInitializeNVPTXAsmPrinter(); 4651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Initialize the LLVM optimization passes. 4671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry(); 4681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins InitializePasses(registry); 4691e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 4701e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4711e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} // namespace 4721e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4731e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsStatusOr<string> CompileToPtx(llvm::Module* module, 4741e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins const string& libdevice_dir_path) { 4751e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins static std::once_flag backend_init_flag; 4761e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins std::call_once(backend_init_flag, GPUBackendInit); 4771e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4781e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins string ptx; 4791e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins { 4801e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins ScopedLoggingTimer compilation_timer( 4811e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins "Compile module " + llvm_ir::AsString(module->getName()), 4821e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins /*vlog_level=*/2); 4831e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins TF_ASSIGN_OR_RETURN(ptx, CompileModuleToPtx(module, libdevice_dir_path)); 4841e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 4851e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return ptx; 4861e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 4871e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4881e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} // namespace gpu 4891e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} // namespace xla 490