11e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 21e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 31e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsLicensed under the Apache License, Version 2.0 (the "License"); 41e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsyou may not use this file except in compliance with the License. 51e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsYou may obtain a copy of the License at 61e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 71e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins http://www.apache.org/licenses/LICENSE-2.0 81e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 91e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsUnless required by applicable law or agreed to in writing, software 101e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsdistributed under the License is distributed on an "AS IS" BASIS, 111e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 121e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsSee the License for the specific language governing permissions and 131e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinslimitations under the License. 141e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins==============================================================================*/ 151e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 161e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" 171e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 181e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include <map> 191e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include <memory> 201e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include <string> 211e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include <utility> 221e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 231e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/ptr_util.h" 241e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h" 251e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h" 261e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" 27abbb19bb9445ffee96ff2946083a3b5c8dadc0d0Eli Bendersky#include "tensorflow/compiler/xla/status_macros.h" 281e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/util.h" 291e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3034cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/ADT/STLExtras.h" 3134cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/ADT/StringMap.h" 3234cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/ADT/StringSet.h" 3334cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Analysis/TargetLibraryInfo.h" 3434cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Analysis/TargetTransformInfo.h" 3534cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Bitcode/BitcodeReader.h" 3634cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Bitcode/BitcodeWriter.h" 374e9fa6dcce4912a4797c48f4cb55d3564961bfcaA. Unique TensorFlower#include "llvm/CodeGen/CommandFlags.def" 3834cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/IR/LLVMContext.h" 3934cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/IR/LegacyPassManager.h" 4034cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/IR/Module.h" 4169db15cc49f84baaff27a19d792102876e811c97A. Unique TensorFlower#include "llvm/IR/Verifier.h" 4234cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Linker/Linker.h" 4334cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/PassRegistry.h" 4434cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Support/CommandLine.h" 4534cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Support/FileSystem.h" 4634cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Support/FormattedStream.h" 4734cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Support/TargetRegistry.h" 4834cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Support/TargetSelect.h" 4934cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Support/ToolOutputFile.h" 5034cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Target/TargetMachine.h" 5134cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Transforms/IPO.h" 5234cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Transforms/IPO/AlwaysInliner.h" 5334cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa#include "llvm/Transforms/IPO/Internalize.h" 5469db15cc49f84baaff27a19d792102876e811c97A. Unique TensorFlower#include "llvm/Transforms/IPO/PassManagerBuilder.h" 5569db15cc49f84baaff27a19d792102876e811c97A. Unique TensorFlower#include "llvm/Transforms/Scalar.h" 561e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/compiler/xla/types.h" 571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/lib/core/stringpiece.h" 581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/lib/io/path.h" 591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/lib/strings/str_util.h" 601e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/lib/strings/stringprintf.h" 611e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/platform/env.h" 621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins#include "tensorflow/core/platform/logging.h" 630bd46f52dad251846996bf440177128a16d429c2Artem Belevich#include "tensorflow/core/platform/tracing.h" 641e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsnamespace xla { 661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsnamespace gpu { 671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsnamespace { 681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 691e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Default inline threshold value to use in llvm. 701e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsconst int kDefaultInlineThreshold = 1100; 711e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 727b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar// Gets the libdevice filename for a particular compute capability. When 737b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar// presented with a GPU we don't recognize, we just return the libdevice from 747b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar// compute_20. 75a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyenstatic string GetLibdeviceFilename(const string& libdevice_dir_path, 76a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen std::pair<int, int> compute_capability) { 77a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen // Since CUDA 9.0, all GPU versions are included in a single file 78a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen const char* unified_libdevice_filename = "libdevice.10.bc"; 79a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen std::vector<string> unified_libdevice_files; 80191825e63f341a4e7777b85254f616e541000d5cA. Unique TensorFlower const tensorflow::Status status = 81a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen tensorflow::Env::Default()->GetMatchingPaths( 82a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen tensorflow::io::JoinPath(libdevice_dir_path, unified_libdevice_filename), 83a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen &unified_libdevice_files); 84a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen if (status.ok() && unified_libdevice_files.size() == 1) { 85a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen return unified_libdevice_filename; 86a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen } 877b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar // There are only four libdevice files: compute_{20,30,35,50}. Each GPU 887b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar // version gets mapped to one of these. Note in particular that sm_60 and 897b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar // sm_61 map to libdevice.compute_30. 907b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar static auto* m = new std::map<std::pair<int, int>, int>({{{2, 0}, 20}, 917b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{2, 1}, 20}, 927b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{3, 0}, 30}, 937b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{3, 2}, 30}, 947b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{3, 5}, 35}, 957b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{3, 7}, 35}, 967b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{5, 0}, 50}, 977b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{5, 2}, 50}, 987b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{5, 3}, 50}, 997b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{6, 0}, 30}, 1007b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{6, 1}, 30}, 1017b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{6, 2}, 30}}); 1027b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar int libdevice_version = 20; 1037b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar auto it = m->find(compute_capability); 1047b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar if (it != m->end()) { 1057b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar libdevice_version = it->second; 1067b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar } else { 1077b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar LOG(WARNING) << "Unknown compute capability (" << compute_capability.first 1087b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar << ", " << compute_capability.second << ") ." 1097b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar << "Defaulting to libdevice for compute_" << libdevice_version; 1107b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar } 1117b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar return tensorflow::strings::StrCat("libdevice.compute_", libdevice_version, 1127b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar ".10.bc"); 1137b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar} 1147b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar 1157b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar// Gets the GPU name as it's known to LLVM for a given compute capability. If 116a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen// we see an unrecognized compute capability, we return "sm_30". 1177b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebarstatic string GetSmName(std::pair<int, int> compute_capability) { 1187b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar static auto* m = new std::map<std::pair<int, int>, int>({{{2, 0}, 20}, 1197b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{2, 1}, 21}, 1207b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{3, 0}, 30}, 1217b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{3, 2}, 32}, 1227b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{3, 5}, 35}, 1237b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{3, 7}, 37}, 1247b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{5, 0}, 50}, 1257b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{5, 2}, 52}, 1267b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{5, 3}, 53}, 1277b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{6, 0}, 60}, 1287b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar {{6, 1}, 61}, 129a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen {{6, 2}, 62}, 130a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen // TODO: Change this to 70 once LLVM NVPTX supports it 131a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen {{7, 0}, 60}}); 132a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen int sm_version = 30; 1337b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar auto it = m->find(compute_capability); 1347b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar if (it != m->end()) { 1357b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar sm_version = it->second; 1367b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar } else { 1377b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar LOG(WARNING) << "Unknown compute capability (" << compute_capability.first 1387b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar << ", " << compute_capability.second << ") ." 1397b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar << "Defaulting to telling LLVM that we're compiling for sm_" 1407b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar << sm_version; 1411e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 1427b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar return tensorflow::strings::StrCat("sm_", sm_version); 1431e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 1441e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 1451e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Convenience function for producing a name of a temporary compilation product 1461e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// from the input filename. 1471e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsstring MakeNameForTempProduct(const std::string& input_filename, 1481e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins tensorflow::StringPiece extension) { 14912efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky return ReplaceFilenameExtension( 15012efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky tensorflow::io::Basename(llvm_ir::AsString(input_filename)), extension); 1511e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 1521e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 1531e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Initializes LLVM passes. Uses the PassRegistry mechanism. 1541e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsvoid InitializePasses(llvm::PassRegistry* pass_registry) { 1551e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeCore(*pass_registry); 1561e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeCodeGen(*pass_registry); 1571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeScalarOpts(*pass_registry); 1581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeObjCARCOpts(*pass_registry); 1591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeVectorization(*pass_registry); 1601e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeIPO(*pass_registry); 1611e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeAnalysis(*pass_registry); 1621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeTransformUtils(*pass_registry); 1631e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeInstCombine(*pass_registry); 1641e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeInstrumentation(*pass_registry); 1651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeTarget(*pass_registry); 1661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::initializeCodeGenPreparePass(*pass_registry); 1671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 1681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 1691e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Returns the TargetMachine, given a triple. 1701e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsstd::unique_ptr<llvm::TargetMachine> GetTargetMachine( 171d45505fe0c7ab9a10f16682f54d0eb54c4776cd1Justin Lebar llvm::Triple triple, tensorflow::StringPiece cpu_name, 172d45505fe0c7ab9a10f16682f54d0eb54c4776cd1Justin Lebar const HloModuleConfig& hlo_module_config) { 1731e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins std::string error; 1741e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins const llvm::Target* target = TargetRegistry::lookupTarget("", triple, error); 1751e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (target == nullptr) { 1761e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LOG(FATAL) << "Unable to find Target for triple '" << triple.str() << "'" 1771e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins << " -- " << error; 1781e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return nullptr; 1791e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 1801e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 1811e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins TargetOptions target_options = InitTargetOptionsFromCodeGenFlags(); 182abbb19bb9445ffee96ff2946083a3b5c8dadc0d0Eli Bendersky llvm_ir::SetTargetOptions( 18332e37a1ff2587e02fd35e316c0ac00dcc4e72d17Eli Bendersky /*fast_math_enabled=*/hlo_module_config.debug_options() 18432e37a1ff2587e02fd35e316c0ac00dcc4e72d17Eli Bendersky .xla_enable_fast_math(), 185abbb19bb9445ffee96ff2946083a3b5c8dadc0d0Eli Bendersky &target_options); 186d45505fe0c7ab9a10f16682f54d0eb54c4776cd1Justin Lebar 18712efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky // Enable FMA synthesis. 18812efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky target_options.AllowFPOpFusion = FPOpFusion::Fast; 1891e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 1901e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Set the verbose assembly options. 19112efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky target_options.MCOptions.AsmVerbose = false; 1921e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 1931e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // The selection of codegen optimization level is copied from function 19434cbf161d7b1191ad5c1b3bc02fc52d338e8b175Jiri Simsa // GetCodeGenOptLevel in //third_party/llvm/llvm/tools/opt/opt.cpp. 1951e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins CodeGenOpt::Level codegen_opt_level; 19612efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky switch (hlo_module_config.debug_options().xla_backend_optimization_level()) { 1971e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins case 1: 1981e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins codegen_opt_level = CodeGenOpt::Less; 1991e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins break; 2001e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins case 2: 2011e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins codegen_opt_level = CodeGenOpt::Default; 2021e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins break; 2031e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins case 3: 2041e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins codegen_opt_level = CodeGenOpt::Aggressive; 2051e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins break; 2061e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins default: 2071e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins codegen_opt_level = CodeGenOpt::None; 2081e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 2091e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return WrapUnique(target->createTargetMachine( 2101e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins triple.str(), llvm_ir::AsStringRef(cpu_name), "+ptx42", target_options, 2113f7c8210fea1a6559b1484b1933f59c1e844fdb1Benjamin Kramer Optional<Reloc::Model>(RelocModel), Optional<CodeModel::Model>(CMModel), 2123f7c8210fea1a6559b1484b1933f59c1e844fdb1Benjamin Kramer codegen_opt_level)); 2131e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 2141e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2151e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Adds the standard LLVM optimization passes, based on the speed optimization 2161e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// level (opt_level) and size optimization level (size_level). Both module 2171e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// and function-level passes are added, so two pass managers are passed in and 2181e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// modified by this function. 2191e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsvoid AddOptimizationPasses(unsigned opt_level, unsigned size_level, 2201e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::TargetMachine* target_machine, 2211e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::legacy::PassManagerBase* module_passes, 2221e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::legacy::FunctionPassManager* function_passes) { 2231e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins PassManagerBuilder builder; 2241e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.OptLevel = opt_level; 2251e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.SizeLevel = size_level; 2261e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2271e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (opt_level > 1) { 2281e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.Inliner = llvm::createFunctionInliningPass(kDefaultInlineThreshold); 2291e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } else { 2301e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Only inline functions marked with "alwaysinline". 2311e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.Inliner = llvm::createAlwaysInlinerLegacyPass(); 2321e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 2331e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2341e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.DisableUnitAtATime = false; 2351e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.DisableUnrollLoops = opt_level == 0; 2361e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.LoopVectorize = opt_level > 0; 2371e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.SLPVectorize = opt_level > 1 && size_level < 2; 2381e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2391e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // NVPTX's early-as-possible passes include NVVM reflect. 24095c7dfc16063c77871257b4da8d8958c731d19c2A. Unique TensorFlower target_machine->adjustPassManager(builder); 2411e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2421e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.populateFunctionPassManager(*function_passes); 2431e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins builder.populateModulePassManager(*module_passes); 2441e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 2451e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2461e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Emits the given module to a bit code file. 2471e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsvoid EmitBitcodeToFile(const Module& module, tensorflow::StringPiece filename) { 2481e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins std::error_code error_code; 249321a2ebdf39b3e779a9c677aa6c1249a7aa70f49Benjamin Kramer llvm::ToolOutputFile outfile(filename.ToString().c_str(), error_code, 250321a2ebdf39b3e779a9c677aa6c1249a7aa70f49Benjamin Kramer llvm::sys::fs::F_None); 2511e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (error_code) { 2521e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LOG(FATAL) << "opening bitcode file for writing: " << error_code.message(); 2531e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 2541e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 255f77256a164ccb173a85472286311644db11ae5b1Benjamin Kramer llvm::WriteBitcodeToFile(module, outfile.os()); 2561e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins outfile.keep(); 2571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 2581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Emits the given module to PTX. target_machine is an initialized TargetMachine 2601e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// for the NVPTX target. 2611e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsstring EmitModuleToPTX(Module* module, llvm::TargetMachine* target_machine) { 2621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins std::string ptx; // need a std::string instead of a ::string. 2631e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins { 2641e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::raw_string_ostream stream(ptx); 2651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::buffer_ostream pstream(stream); 2661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // The extension is stripped by IrDumpingPassManager, so we need to 2671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // get creative to add a suffix. 2681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins string module_id(llvm_ir::AsString(module->getModuleIdentifier())); 2691e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins IrDumpingPassManager codegen_passes( 2701e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins ReplaceFilenameExtension(tensorflow::io::Basename(module_id), 2711e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins "-nvptx.dummy"), 27212efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky "", false); 2731e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass( 2741e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::Triple(module->getTargetTriple()))); 2751e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2761e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins target_machine->addPassesToEmitFile(codegen_passes, pstream, 2771e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::TargetMachine::CGFT_AssemblyFile); 2781e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins codegen_passes.run(*module); 2791e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 2801e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2811e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return ptx; 2821e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 2831e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2841e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// LLVM has an extensive flags mechanism of its own, which is only accessible 2851e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// through the command line. Internal libraries within LLVM register parsers for 2861e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// flags, with no other way to configure them except pass these flags. 2871e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// To do this programmatically, we invoke ParseCommandLineOptions manually with 2881e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// a "fake argv". 2891e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Note: setting flags with this method is stateful, since flags are just 2901e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// static globals within LLVM libraries. 2911e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsvoid FeedLLVMWithFlags(const std::vector<string>& cl_opts) { 2921e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins std::vector<const char*> fake_argv = {""}; 2931e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins for (const string& cl_opt : cl_opts) { 2941e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins fake_argv.push_back(cl_opt.c_str()); 2951e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 2961e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]); 2971e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 2981e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 2991e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Returns whether the module could use any libdevice functions. This function 3001e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// may have false positives -- the module might not use libdevice even if this 3011e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// function returns true. 3021e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkinsbool CouldNeedLibdevice(const llvm::Module& module) { 3031e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins for (const llvm::Function& function : module.functions()) { 3041e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // This is a conservative approximation -- not all such functions are in 3051e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // libdevice. 3061e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (!function.isIntrinsic() && function.isDeclaration()) { 3071e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return true; 3081e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 3091e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 3101e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return false; 3111e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 3121e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3131e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Links libdevice into the given module if the module needs libdevice. 3147b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebartensorflow::Status LinkLibdeviceIfNecessary( 3157b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar llvm::Module* module, std::pair<int, int> compute_capability, 3167b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar const string& libdevice_dir_path) { 3171e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (!CouldNeedLibdevice(*module)) { 3181e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return tensorflow::Status::OK(); 3191e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 3201e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3211e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::Linker linker(*module); 3227b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar string libdevice_path = tensorflow::io::JoinPath( 323a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen libdevice_dir_path, GetLibdeviceFilename(libdevice_dir_path, 324a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen compute_capability)); 3257b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(libdevice_path)); 3267b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar VLOG(1) << "Linking with libdevice from: " << libdevice_path; 3271e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins std::unique_ptr<llvm::Module> libdevice_module = 3287b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar LoadIRModule(libdevice_path, &module->getContext()); 32900f8415dcada6e416fe67abb99675abbde16845dA. Unique TensorFlower if (linker.linkInModule( 33000f8415dcada6e416fe67abb99675abbde16845dA. Unique TensorFlower std::move(libdevice_module), llvm::Linker::Flags::LinkOnlyNeeded, 33100f8415dcada6e416fe67abb99675abbde16845dA. Unique TensorFlower [](Module& M, const StringSet<>& GVS) { 33200f8415dcada6e416fe67abb99675abbde16845dA. Unique TensorFlower internalizeModule(M, [&M, &GVS](const GlobalValue& GV) { 33300f8415dcada6e416fe67abb99675abbde16845dA. Unique TensorFlower return !GV.hasName() || (GVS.count(GV.getName()) == 0); 33400f8415dcada6e416fe67abb99675abbde16845dA. Unique TensorFlower }); 33500f8415dcada6e416fe67abb99675abbde16845dA. Unique TensorFlower })) { 3367b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar return tensorflow::errors::Internal(tensorflow::strings::StrCat( 3377b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar "Error linking libdevice from ", libdevice_path)); 3381e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 3391e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return tensorflow::Status::OK(); 3401e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 3411e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3421e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsStatusOr<string> CompileModuleToPtx(llvm::Module* module, 3437b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar std::pair<int, int> compute_capability, 344d45505fe0c7ab9a10f16682f54d0eb54c4776cd1Justin Lebar const HloModuleConfig& hlo_module_config, 3451e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins const string& libdevice_dir_path) { 34666fc99a3b53c2e77d1c8569e1597a0094b0f99a8Artem Belevich // If the module has no functions or globals, there's nothing to compile. Just 34766fc99a3b53c2e77d1c8569e1597a0094b0f99a8Artem Belevich // return an empty string. 34866fc99a3b53c2e77d1c8569e1597a0094b0f99a8Artem Belevich if (module->empty() && module->global_empty()) { 34966fc99a3b53c2e77d1c8569e1597a0094b0f99a8Artem Belevich VLOG(2) << "Module '" << llvm_ir::AsString(module->getName()) 35066fc99a3b53c2e77d1c8569e1597a0094b0f99a8Artem Belevich << "' is empty. Skipping compilation."; 35166fc99a3b53c2e77d1c8569e1597a0094b0f99a8Artem Belevich return string(); 35266fc99a3b53c2e77d1c8569e1597a0094b0f99a8Artem Belevich } 3531e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Link the input module with libdevice, to pull in implementations of some 3541e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // builtins. 3557b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar TF_RETURN_IF_ERROR( 3567b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar LinkLibdeviceIfNecessary(module, compute_capability, libdevice_dir_path)); 3571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Set the flush-denormals-to-zero flag on the module so the NVVM reflect pass 3591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // can access it. 36012efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", 36112efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky hlo_module_config.debug_options().xla_gpu_ftz()); 3621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3631e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // If ftz is enabled, set it as an attribute on every function in the module. 36412efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky if (hlo_module_config.debug_options().xla_gpu_ftz()) { 3651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins for (llvm::Function& fn : *module) { 3661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins fn.addFnAttr("nvptx-f32ftz", "true"); 3671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 3681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 3691e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 37012efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky IrDumpingPassManager module_passes(module->getModuleIdentifier(), "", false); 3711e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3721e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Add an appropriate TargetLibraryInfo pass for the module's triple. 3731e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::TargetLibraryInfoWrapperPass* tliwp = 3741e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins new llvm::TargetLibraryInfoWrapperPass( 3751e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::Triple(module->getTargetTriple())); 3761e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins module_passes.add(tliwp); 3771e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3781e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Try to fetch the target triple from the module. If not present, set a 3791e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // default target triple. 3801e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::Triple target_triple = llvm::Triple(module->getTargetTriple()); 3811e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins if (target_triple.getArch() == llvm::Triple::UnknownArch) { 3821e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LOG(WARNING) << "target triple not found in the module"; 3831e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins target_triple = llvm::Triple("nvptx64-unknown-unknown"); 3841e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 3851e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3861e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Figure out the exact name of the processor as known to the NVPTX backend 3871e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // from the gpu_architecture flag. 3887b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar std::unique_ptr<llvm::TargetMachine> target_machine = GetTargetMachine( 3897b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar target_triple, GetSmName(compute_capability), hlo_module_config); 3901e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins module_passes.add(llvm::createTargetTransformInfoWrapperPass( 3911e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins target_machine->getTargetIRAnalysis())); 3921e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3931e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // The LLVM IR verifier performs sanity checking on the IR. This helps 3941e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // discover problems and report them in a meaningful manner, rather than let 39553cb26d05a5c2080d8022124178b1cc43a30ffe5A. Unique TensorFlower // later passes report obscure assertions because of unfulfilled invariants. 3961e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins module_passes.add(llvm::createVerifierPass()); 3971e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 3981e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Create the function-level pass manager. It needs data layout information 3991e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // too. 4001e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::legacy::FunctionPassManager function_passes(module); 4011e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 40212efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky int32 opt_level = 40312efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky hlo_module_config.debug_options().xla_backend_optimization_level(); 40412efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky 405ac47dc166f290d631c156846039ac78f30f362afEli Bendersky CHECK_GE(opt_level, 2) 406ac47dc166f290d631c156846039ac78f30f362afEli Bendersky << "The XLA GPU backend doesn't support unoptimized code generation"; 407ac47dc166f290d631c156846039ac78f30f362afEli Bendersky 40812efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky AddOptimizationPasses(opt_level, 40912efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky /*size_level=*/0, target_machine.get(), &module_passes, 41012efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky &function_passes); 41112efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky 4121b5235fd897f7ea5cffc715300f67b4dc852fa27Jonathan Hseu // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA 4131e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // again after the standard optimization passes [http://b/13329423]. 414d57572e996dce24abf4d9cf6ea04e7104b3d743bMartin Wicke // TODO(jingyue): SROA may further expose more optimization opportunities such 4151e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // as more precise alias analysis and more function inlining (SROA may change 4161e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // the inlining cost of a function). For now, running SROA already emits good 4171e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // enough code for the evaluated benchmarks. We may want to run more 4181e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // optimizations later. 41912efd3d0bbea953e52aee12eb5a3d5d2269ec16aEli Bendersky if (opt_level > 0) { 4201e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // LLVM's optimizer turns on SROA when the optimization level is greater 4211e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // than 0. We mimic this behavior here. 4221e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins module_passes.add(llvm::createSROAPass()); 4231e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 4241e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4251e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Verify that the module is well formed after optimizations ran. 4261e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins module_passes.add(llvm::createVerifierPass()); 4271e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4281e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Done populating the pass managers. Now run them. 4291e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4301e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins function_passes.doInitialization(); 4311e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins for (auto func = module->begin(); func != module->end(); ++func) { 4321e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins function_passes.run(*func); 4331e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 4341e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins function_passes.doFinalization(); 4351e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins module_passes.run(*module); 4361e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4371e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Finally, produce PTX. 4381e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return EmitModuleToPTX(module, target_machine.get()); 4391e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 4401e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4411e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// One-time module initializer. 4421e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins// Must be called only once -- DO NOT CALL DIRECTLY. 4431e934ece7122cc623861a76ec3076f0dfb782225A. Unique TensorFlowervoid GPUBackendInit(const HloModuleConfig& hlo_module_config) { 4441e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Feed all customized flags here, so we can override them with llvm_cl_opts 4451e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // without redeploy the compiler for development purpose. 4461e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4471e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // This flag tunes a threshold in branch folding. The default threshold, which 4481e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // is one, is not suitable for CUDA programs where branches are more expensive 4491e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // than for CPU programs. Setting the threshold to 2 improves the latency of 4501e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // TwoDPatchDotProductKernel_IND_3_ND_48 by over 5%, and does not affect the 4511e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // latency of other benchmarks so far. 4521e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // 4531e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // I also tried setting this threshold to other values: 4541e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // * 3-6 gives similar results as 2; 4551e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // * >6 start hurting the performance of at least dot product kernels. 4561e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // 4571e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // TODO(jingyue): The current threshold only considers the numbr of IR 4581e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // instructions which do not accurately reflect the true cost. We need a 4591e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // better cost model. 4601e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins FeedLLVMWithFlags({"-bonus-inst-threshold=2"}); 4611e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // TODO(b/22073864): Increase limit when scan memory dependency. 4621e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // This helps to reduce more redundant load instructions. 4631e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // 4641e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // The specific value is currently large enough for s3d in shoc benchmark, 4651e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // which contains a lot of load instructions and many arithmetic instructions 4661e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // between those loads. 4671e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins FeedLLVMWithFlags({"-memdep-block-scan-limit=500"}); 4681e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4691e934ece7122cc623861a76ec3076f0dfb782225A. Unique TensorFlower llvm_ir::InitializeLLVMCommandLineOptions(hlo_module_config); 4701e934ece7122cc623861a76ec3076f0dfb782225A. Unique TensorFlower 4711e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Initialize the NVPTX target; it's the only target we link with, so call its 4721e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // specific initialization functions instead of the catch-all InitializeAll*. 4731e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LLVMInitializeNVPTXTarget(); 4741e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LLVMInitializeNVPTXTargetInfo(); 4751e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LLVMInitializeNVPTXTargetMC(); 4761e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins LLVMInitializeNVPTXAsmPrinter(); 4771e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4781e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins // Initialize the LLVM optimization passes. 4791e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry(); 4801e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins InitializePasses(registry); 4811e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 4821e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4831e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} // namespace 4841e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4851e67c90e2caceeff82d09793d1ef5fa0300d219bPeter HawkinsStatusOr<string> CompileToPtx(llvm::Module* module, 4867b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar std::pair<int, int> compute_capability, 487d45505fe0c7ab9a10f16682f54d0eb54c4776cd1Justin Lebar const HloModuleConfig& hlo_module_config, 4881e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins const string& libdevice_dir_path) { 4891e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins static std::once_flag backend_init_flag; 4901e934ece7122cc623861a76ec3076f0dfb782225A. Unique TensorFlower std::call_once(backend_init_flag, GPUBackendInit, hlo_module_config); 4911e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 4921e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins string ptx; 4931e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins { 4940bd46f52dad251846996bf440177128a16d429c2Artem Belevich tensorflow::port::Tracing::TraceMe annotation( 4950bd46f52dad251846996bf440177128a16d429c2Artem Belevich "Compiling IR", llvm_ir::AsString(module->getName()), 4960bd46f52dad251846996bf440177128a16d429c2Artem Belevich /*is_expensive=*/true); 497b525ea6798175f4c95996a3666c70de5c00a9a0cJustin Lebar XLA_SCOPED_LOGGING_TIMER("Compile module " + 498b525ea6798175f4c95996a3666c70de5c00a9a0cJustin Lebar llvm_ir::AsString(module->getName())); 499d45505fe0c7ab9a10f16682f54d0eb54c4776cd1Justin Lebar TF_ASSIGN_OR_RETURN( 5007b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar ptx, CompileModuleToPtx(module, compute_capability, hlo_module_config, 5017b02fa6a27022275517ed5b851b06ba19a11bdf0Justin Lebar libdevice_dir_path)); 5021e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins } 5031e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins return ptx; 5041e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} 5051e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins 5061e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} // namespace gpu 5071e67c90e2caceeff82d09793d1ef5fa0300d219bPeter Hawkins} // namespace xla 508