1/* 2 * Copyright 2012, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "bcc/Assert.h" 18#include "bcc/Renderscript/RSTransforms.h" 19#include "bcc/Renderscript/RSUtils.h" 20 21#include <cstdlib> 22#include <functional> 23#include <unordered_set> 24 25#include <llvm/IR/DerivedTypes.h> 26#include <llvm/IR/Function.h> 27#include <llvm/IR/Instructions.h> 28#include <llvm/IR/IRBuilder.h> 29#include <llvm/IR/MDBuilder.h> 30#include <llvm/IR/Module.h> 31#include <llvm/Pass.h> 32#include <llvm/Support/raw_ostream.h> 33#include <llvm/IR/DataLayout.h> 34#include <llvm/IR/Function.h> 35#include <llvm/IR/Type.h> 36#include <llvm/Transforms/Utils/BasicBlockUtils.h> 37 38#include "bcc/Config/Config.h" 39#include "bcc/Support/Log.h" 40 41#include "bcinfo/MetadataExtractor.h" 42 43#ifndef __DISABLE_ASSERTS 44// Only used in bccAssert() 45const int kNumExpandedForeachParams = 4; 46const int kNumExpandedReduceAccumulatorParams = 4; 47#endif 48 49const char kRenderScriptTBAARootName[] = "RenderScript Distinct TBAA"; 50const char kRenderScriptTBAANodeName[] = "RenderScript TBAA"; 51 52using namespace bcc; 53 54namespace { 55 56static const bool gEnableRsTbaa = true; 57 58/* RSKernelExpandPass - This pass operates on functions that are able 59 * to be called via rsForEach(), "foreach_<NAME>", or 60 * "reduce_<NAME>". We create an inner loop for the function to be 61 * invoked over the appropriate data cells of the input/output 62 * allocations (adjusting other relevant parameters as we go). We 63 * support doing this for any forEach or reduce style compute 64 * kernels. The new function name is the original function name 65 * followed by ".expand". Note that we still generate code for the 66 * original function. 67 */ 68class RSKernelExpandPass : public llvm::ModulePass { 69public: 70 static char ID; 71 72private: 73 static const size_t RS_KERNEL_INPUT_LIMIT = 8; // see frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h 74 75 typedef std::unordered_set<llvm::Function *> FunctionSet; 76 77 enum RsLaunchDimensionsField { 78 RsLaunchDimensionsFieldX, 79 RsLaunchDimensionsFieldY, 80 RsLaunchDimensionsFieldZ, 81 RsLaunchDimensionsFieldLod, 82 RsLaunchDimensionsFieldFace, 83 RsLaunchDimensionsFieldArray, 84 85 RsLaunchDimensionsFieldCount 86 }; 87 88 enum RsExpandKernelDriverInfoPfxField { 89 RsExpandKernelDriverInfoPfxFieldInPtr, 90 RsExpandKernelDriverInfoPfxFieldInStride, 91 RsExpandKernelDriverInfoPfxFieldInLen, 92 RsExpandKernelDriverInfoPfxFieldOutPtr, 93 RsExpandKernelDriverInfoPfxFieldOutStride, 94 RsExpandKernelDriverInfoPfxFieldOutLen, 95 RsExpandKernelDriverInfoPfxFieldDim, 96 RsExpandKernelDriverInfoPfxFieldCurrent, 97 RsExpandKernelDriverInfoPfxFieldUsr, 98 RsExpandKernelDriverInfoPfxFieldUsLenr, 99 100 RsExpandKernelDriverInfoPfxFieldCount 101 }; 102 103 llvm::Module *Module; 104 llvm::LLVMContext *Context; 105 106 /* 107 * Pointers to LLVM type information for the the function signatures 108 * for expanded functions. These must be re-calculated for each module 109 * the pass is run on. 110 */ 111 llvm::FunctionType *ExpandedForEachType; 112 llvm::Type *RsExpandKernelDriverInfoPfxTy; 113 114 uint32_t mExportForEachCount; 115 const char **mExportForEachNameList; 116 const uint32_t *mExportForEachSignatureList; 117 118 // Turns on optimization of allocation stride values. 119 bool mEnableStepOpt; 120 121 uint32_t getRootSignature(llvm::Function *Function) { 122 const llvm::NamedMDNode *ExportForEachMetadata = 123 Module->getNamedMetadata("#rs_export_foreach"); 124 125 if (!ExportForEachMetadata) { 126 llvm::SmallVector<llvm::Type*, 8> RootArgTys; 127 for (llvm::Function::arg_iterator B = Function->arg_begin(), 128 E = Function->arg_end(); 129 B != E; 130 ++B) { 131 RootArgTys.push_back(B->getType()); 132 } 133 134 // For pre-ICS bitcode, we may not have signature information. In that 135 // case, we use the size of the RootArgTys to select the number of 136 // arguments. 137 return (1 << RootArgTys.size()) - 1; 138 } 139 140 if (ExportForEachMetadata->getNumOperands() == 0) { 141 return 0; 142 } 143 144 bccAssert(ExportForEachMetadata->getNumOperands() > 0); 145 146 // We only handle the case for legacy root() functions here, so this is 147 // hard-coded to look at only the first such function. 148 llvm::MDNode *SigNode = ExportForEachMetadata->getOperand(0); 149 if (SigNode != nullptr && SigNode->getNumOperands() == 1) { 150 llvm::Metadata *SigMD = SigNode->getOperand(0); 151 if (llvm::MDString *SigS = llvm::dyn_cast<llvm::MDString>(SigMD)) { 152 llvm::StringRef SigString = SigS->getString(); 153 uint32_t Signature = 0; 154 if (SigString.getAsInteger(10, Signature)) { 155 ALOGE("Non-integer signature value '%s'", SigString.str().c_str()); 156 return 0; 157 } 158 return Signature; 159 } 160 } 161 162 return 0; 163 } 164 165 bool isStepOptSupported(llvm::Type *AllocType) { 166 167 llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType); 168 llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context); 169 170 if (mEnableStepOpt) { 171 return false; 172 } 173 174 if (AllocType == VoidPtrTy) { 175 return false; 176 } 177 178 if (!PT) { 179 return false; 180 } 181 182 // remaining conditions are 64-bit only 183 if (VoidPtrTy->getPrimitiveSizeInBits() == 32) { 184 return true; 185 } 186 187 // coerce suggests an upconverted struct type, which we can't support 188 if (AllocType->getStructName().find("coerce") != llvm::StringRef::npos) { 189 return false; 190 } 191 192 // 2xi64 and i128 suggest an upconverted struct type, which are also unsupported 193 llvm::Type *V2xi64Ty = llvm::VectorType::get(llvm::Type::getInt64Ty(*Context), 2); 194 llvm::Type *Int128Ty = llvm::Type::getIntNTy(*Context, 128); 195 if (AllocType == V2xi64Ty || AllocType == Int128Ty) { 196 return false; 197 } 198 199 return true; 200 } 201 202 // Get the actual value we should use to step through an allocation. 203 // 204 // Normally the value we use to step through an allocation is given to us by 205 // the driver. However, for certain primitive data types, we can derive an 206 // integer constant for the step value. We use this integer constant whenever 207 // possible to allow further compiler optimizations to take place. 208 // 209 // DL - Target Data size/layout information. 210 // T - Type of allocation (should be a pointer). 211 // OrigStep - Original step increment (root.expand() input from driver). 212 llvm::Value *getStepValue(llvm::DataLayout *DL, llvm::Type *AllocType, 213 llvm::Value *OrigStep) { 214 bccAssert(DL); 215 bccAssert(AllocType); 216 bccAssert(OrigStep); 217 llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType); 218 if (isStepOptSupported(AllocType)) { 219 llvm::Type *ET = PT->getElementType(); 220 uint64_t ETSize = DL->getTypeAllocSize(ET); 221 llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context); 222 return llvm::ConstantInt::get(Int32Ty, ETSize); 223 } else { 224 return OrigStep; 225 } 226 } 227 228 /// Builds the types required by the pass for the given context. 229 void buildTypes(void) { 230 // Create the RsLaunchDimensionsTy and RsExpandKernelDriverInfoPfxTy structs. 231 232 llvm::Type *Int8Ty = llvm::Type::getInt8Ty(*Context); 233 llvm::Type *Int8PtrTy = Int8Ty->getPointerTo(); 234 llvm::Type *Int8PtrArrayInputLimitTy = llvm::ArrayType::get(Int8PtrTy, RS_KERNEL_INPUT_LIMIT); 235 llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context); 236 llvm::Type *Int32ArrayInputLimitTy = llvm::ArrayType::get(Int32Ty, RS_KERNEL_INPUT_LIMIT); 237 llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context); 238 llvm::Type *Int32Array4Ty = llvm::ArrayType::get(Int32Ty, 4); 239 240 /* Defined in frameworks/base/libs/rs/cpu_ref/rsCpuCore.h: 241 * 242 * struct RsLaunchDimensions { 243 * uint32_t x; 244 * uint32_t y; 245 * uint32_t z; 246 * uint32_t lod; 247 * uint32_t face; 248 * uint32_t array[4]; 249 * }; 250 */ 251 llvm::SmallVector<llvm::Type*, RsLaunchDimensionsFieldCount> RsLaunchDimensionsTypes; 252 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t x 253 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t y 254 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t z 255 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t lod 256 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t face 257 RsLaunchDimensionsTypes.push_back(Int32Array4Ty); // uint32_t array[4] 258 llvm::StructType *RsLaunchDimensionsTy = 259 llvm::StructType::create(RsLaunchDimensionsTypes, "RsLaunchDimensions"); 260 261 /* Defined as the beginning of RsExpandKernelDriverInfo in frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h: 262 * 263 * struct RsExpandKernelDriverInfoPfx { 264 * const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT]; 265 * uint32_t inStride[RS_KERNEL_INPUT_LIMIT]; 266 * uint32_t inLen; 267 * 268 * uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT]; 269 * uint32_t outStride[RS_KERNEL_INPUT_LIMIT]; 270 * uint32_t outLen; 271 * 272 * // Dimension of the launch 273 * RsLaunchDimensions dim; 274 * 275 * // The walking iterator of the launch 276 * RsLaunchDimensions current; 277 * 278 * const void *usr; 279 * uint32_t usrLen; 280 * 281 * // Items below this line are not used by the compiler and can be change in the driver. 282 * // So the compiler must assume there are an unknown number of fields of unknown type 283 * // beginning here. 284 * }; 285 * 286 * The name "RsExpandKernelDriverInfoPfx" is known to RSInvariantPass (RSInvariant.cpp). 287 */ 288 llvm::SmallVector<llvm::Type*, RsExpandKernelDriverInfoPfxFieldCount> RsExpandKernelDriverInfoPfxTypes; 289 RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT] 290 RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy); // uint32_t inStride[RS_KERNEL_INPUT_LIMIT] 291 RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty); // uint32_t inLen 292 RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT] 293 RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy); // uint32_t outStride[RS_KERNEL_INPUT_LIMIT] 294 RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty); // uint32_t outLen 295 RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy); // RsLaunchDimensions dim 296 RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy); // RsLaunchDimensions current 297 RsExpandKernelDriverInfoPfxTypes.push_back(VoidPtrTy); // const void *usr 298 RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty); // uint32_t usrLen 299 RsExpandKernelDriverInfoPfxTy = 300 llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx"); 301 302 // Create the function type for expanded kernels. 303 llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context); 304 305 llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo(); 306 // void (const RsExpandKernelDriverInfoPfxTy *p, uint32_t x1, uint32_t x2, uint32_t outstep) 307 ExpandedForEachType = llvm::FunctionType::get(VoidTy, 308 {RsExpandKernelDriverInfoPfxPtrTy, Int32Ty, Int32Ty, Int32Ty}, false); 309 } 310 311 /// @brief Create skeleton of the expanded foreach kernel. 312 /// 313 /// This creates a function with the following signature: 314 /// 315 /// void (const RsForEachStubParamStruct *p, uint32_t x1, uint32_t x2, 316 /// uint32_t outstep) 317 /// 318 llvm::Function *createEmptyExpandedForEachKernel(llvm::StringRef OldName) { 319 llvm::Function *ExpandedFunction = 320 llvm::Function::Create(ExpandedForEachType, 321 llvm::GlobalValue::ExternalLinkage, 322 OldName + ".expand", Module); 323 bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams); 324 llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin(); 325 (AI++)->setName("p"); 326 (AI++)->setName("x1"); 327 (AI++)->setName("x2"); 328 (AI++)->setName("arg_outstep"); 329 llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin", 330 ExpandedFunction); 331 llvm::IRBuilder<> Builder(Begin); 332 Builder.CreateRetVoid(); 333 return ExpandedFunction; 334 } 335 336 // Create skeleton of a general reduce kernel's expanded accumulator. 337 // 338 // This creates a function with the following signature: 339 // 340 // void @func.expand(%RsExpandKernelDriverInfoPfx* nocapture %p, 341 // i32 %x1, i32 %x2, accumType* nocapture %accum) 342 // 343 llvm::Function *createEmptyExpandedReduceAccumulator(llvm::StringRef OldName, 344 llvm::Type *AccumArgTy) { 345 llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context); 346 llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context); 347 llvm::FunctionType *ExpandedReduceAccumulatorType = 348 llvm::FunctionType::get(VoidTy, 349 {RsExpandKernelDriverInfoPfxTy->getPointerTo(), 350 Int32Ty, Int32Ty, AccumArgTy}, false); 351 llvm::Function *FnExpandedAccumulator = 352 llvm::Function::Create(ExpandedReduceAccumulatorType, 353 llvm::GlobalValue::ExternalLinkage, 354 OldName + ".expand", Module); 355 bccAssert(FnExpandedAccumulator->arg_size() == kNumExpandedReduceAccumulatorParams); 356 357 llvm::Function::arg_iterator AI = FnExpandedAccumulator->arg_begin(); 358 359 using llvm::Attribute; 360 361 llvm::Argument *Arg_p = &(*AI++); 362 Arg_p->setName("p"); 363 Arg_p->addAttr(llvm::AttributeSet::get(*Context, Arg_p->getArgNo() + 1, 364 llvm::makeArrayRef(Attribute::NoCapture))); 365 366 llvm::Argument *Arg_x1 = &(*AI++); 367 Arg_x1->setName("x1"); 368 369 llvm::Argument *Arg_x2 = &(*AI++); 370 Arg_x2->setName("x2"); 371 372 llvm::Argument *Arg_accum = &(*AI++); 373 Arg_accum->setName("accum"); 374 Arg_accum->addAttr(llvm::AttributeSet::get(*Context, Arg_accum->getArgNo() + 1, 375 llvm::makeArrayRef(Attribute::NoCapture))); 376 377 llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin", 378 FnExpandedAccumulator); 379 llvm::IRBuilder<> Builder(Begin); 380 Builder.CreateRetVoid(); 381 382 return FnExpandedAccumulator; 383 } 384 385 /// @brief Create an empty loop 386 /// 387 /// Create a loop of the form: 388 /// 389 /// for (i = LowerBound; i < UpperBound; i++) 390 /// ; 391 /// 392 /// After the loop has been created, the builder is set such that 393 /// instructions can be added to the loop body. 394 /// 395 /// @param Builder The builder to use to build this loop. The current 396 /// position of the builder is the position the loop 397 /// will be inserted. 398 /// @param LowerBound The first value of the loop iterator 399 /// @param UpperBound The maximal value of the loop iterator 400 /// @param LoopIV A reference that will be set to the loop iterator. 401 /// @return The BasicBlock that will be executed after the loop. 402 llvm::BasicBlock *createLoop(llvm::IRBuilder<> &Builder, 403 llvm::Value *LowerBound, 404 llvm::Value *UpperBound, 405 llvm::Value **LoopIV) { 406 bccAssert(LowerBound->getType() == UpperBound->getType()); 407 408 llvm::BasicBlock *CondBB, *AfterBB, *HeaderBB; 409 llvm::Value *Cond, *IVNext, *IV, *IVVar; 410 411 CondBB = Builder.GetInsertBlock(); 412 AfterBB = llvm::SplitBlock(CondBB, &*Builder.GetInsertPoint(), nullptr, nullptr); 413 HeaderBB = llvm::BasicBlock::Create(*Context, "Loop", CondBB->getParent()); 414 415 CondBB->getTerminator()->eraseFromParent(); 416 Builder.SetInsertPoint(CondBB); 417 418 // decltype(LowerBound) *ivvar = alloca(sizeof(int)) 419 // *ivvar = LowerBound 420 IVVar = Builder.CreateAlloca(LowerBound->getType(), nullptr, BCC_INDEX_VAR_NAME); 421 Builder.CreateStore(LowerBound, IVVar); 422 423 // if (LowerBound < Upperbound) 424 // goto LoopHeader 425 // else 426 // goto AfterBB 427 Cond = Builder.CreateICmpULT(LowerBound, UpperBound); 428 Builder.CreateCondBr(Cond, HeaderBB, AfterBB); 429 430 // LoopHeader: 431 // iv = *ivvar 432 // <insertion point here> 433 // iv.next = iv + 1 434 // *ivvar = iv.next 435 // if (iv.next < Upperbound) 436 // goto LoopHeader 437 // else 438 // goto AfterBB 439 // AfterBB: 440 Builder.SetInsertPoint(HeaderBB); 441 IV = Builder.CreateLoad(IVVar, "X"); 442 IVNext = Builder.CreateNUWAdd(IV, Builder.getInt32(1)); 443 Builder.CreateStore(IVNext, IVVar); 444 Cond = Builder.CreateICmpULT(IVNext, UpperBound); 445 Builder.CreateCondBr(Cond, HeaderBB, AfterBB); 446 AfterBB->setName("Exit"); 447 Builder.SetInsertPoint(llvm::cast<llvm::Instruction>(IVNext)); 448 449 // Record information about this loop. 450 *LoopIV = IV; 451 return AfterBB; 452 } 453 454 // Finish building the outgoing argument list for calling a ForEach-able function. 455 // 456 // ArgVector - on input, the non-special arguments 457 // on output, the non-special arguments combined with the special arguments 458 // from SpecialArgVector 459 // SpecialArgVector - special arguments (from ExpandSpecialArguments()) 460 // SpecialArgContextIdx - return value of ExpandSpecialArguments() 461 // (position of context argument in SpecialArgVector) 462 // CalleeFunction - the ForEach-able function being called 463 // Builder - for inserting code into the caller function 464 template<unsigned int ArgVectorLen, unsigned int SpecialArgVectorLen> 465 void finishArgList( llvm::SmallVector<llvm::Value *, ArgVectorLen> &ArgVector, 466 const llvm::SmallVector<llvm::Value *, SpecialArgVectorLen> &SpecialArgVector, 467 const int SpecialArgContextIdx, 468 const llvm::Function &CalleeFunction, 469 llvm::IRBuilder<> &CallerBuilder) { 470 /* The context argument (if any) is a pointer to an opaque user-visible type that differs from 471 * the RsExpandKernelDriverInfoPfx type used in the function we are generating (although the 472 * two types represent the same thing). Therefore, we must introduce a pointer cast when 473 * generating a call to the kernel function. 474 */ 475 const int ArgContextIdx = 476 SpecialArgContextIdx >= 0 ? (ArgVector.size() + SpecialArgContextIdx) : SpecialArgContextIdx; 477 ArgVector.append(SpecialArgVector.begin(), SpecialArgVector.end()); 478 if (ArgContextIdx >= 0) { 479 llvm::Type *ContextArgType = nullptr; 480 int ArgIdx = ArgContextIdx; 481 for (const auto &Arg : CalleeFunction.getArgumentList()) { 482 if (!ArgIdx--) { 483 ContextArgType = Arg.getType(); 484 break; 485 } 486 } 487 bccAssert(ContextArgType); 488 ArgVector[ArgContextIdx] = CallerBuilder.CreatePointerCast(ArgVector[ArgContextIdx], ContextArgType); 489 } 490 } 491 492 // GEPHelper() returns a SmallVector of values suitable for passing 493 // to IRBuilder::CreateGEP(), and SmallGEPIndices is a typedef for 494 // the returned data type. It is sized so that the SmallVector 495 // returned by GEPHelper() never needs to do a heap allocation for 496 // any list of GEP indices it encounters in the code. 497 typedef llvm::SmallVector<llvm::Value *, 3> SmallGEPIndices; 498 499 // Helper for turning a list of constant integer GEP indices into a 500 // SmallVector of llvm::Value*. The return value is suitable for 501 // passing to a GetElementPtrInst constructor or IRBuilder::CreateGEP(). 502 // 503 // Inputs: 504 // I32Args should be integers which represent the index arguments 505 // to a GEP instruction. 506 // 507 // Returns: 508 // Returns a SmallVector of ConstantInts. 509 SmallGEPIndices GEPHelper(const std::initializer_list<int32_t> I32Args) { 510 SmallGEPIndices Out(I32Args.size()); 511 llvm::IntegerType *I32Ty = llvm::Type::getInt32Ty(*Context); 512 std::transform(I32Args.begin(), I32Args.end(), Out.begin(), 513 [I32Ty](int32_t Arg) { return llvm::ConstantInt::get(I32Ty, Arg); }); 514 return Out; 515 } 516 517public: 518 RSKernelExpandPass(bool pEnableStepOpt = true) 519 : ModulePass(ID), Module(nullptr), Context(nullptr), 520 mEnableStepOpt(pEnableStepOpt) { 521 522 } 523 524 virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override { 525 // This pass does not use any other analysis passes, but it does 526 // add/wrap the existing functions in the module (thus altering the CFG). 527 } 528 529 // Build contribution to outgoing argument list for calling a 530 // ForEach-able function or a general reduction accumulator 531 // function, based on the special parameters of that function. 532 // 533 // Signature - metadata bits for the signature of the callee 534 // X, Arg_p - values derived directly from expanded function, 535 // suitable for computing arguments for the callee 536 // CalleeArgs - contribution is accumulated here 537 // Bump - invoked once for each contributed outgoing argument 538 // LoopHeaderInsertionPoint - an Instruction in the loop header, before which 539 // this function can insert loop-invariant loads 540 // 541 // Return value is the (zero-based) position of the context (Arg_p) 542 // argument in the CalleeArgs vector, or a negative value if the 543 // context argument is not placed in the CalleeArgs vector. 544 int ExpandSpecialArguments(uint32_t Signature, 545 llvm::Value *X, 546 llvm::Value *Arg_p, 547 llvm::IRBuilder<> &Builder, 548 llvm::SmallVector<llvm::Value*, 8> &CalleeArgs, 549 std::function<void ()> Bump, 550 llvm::Instruction *LoopHeaderInsertionPoint) { 551 552 bccAssert(CalleeArgs.empty()); 553 554 int Return = -1; 555 if (bcinfo::MetadataExtractor::hasForEachSignatureCtxt(Signature)) { 556 CalleeArgs.push_back(Arg_p); 557 Bump(); 558 Return = CalleeArgs.size() - 1; 559 } 560 561 if (bcinfo::MetadataExtractor::hasForEachSignatureX(Signature)) { 562 CalleeArgs.push_back(X); 563 Bump(); 564 } 565 566 if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature) || 567 bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) { 568 bccAssert(LoopHeaderInsertionPoint); 569 570 // Y and Z are loop invariant, so they can be hoisted out of the 571 // loop. Set the IRBuilder insertion point to the loop header. 572 auto OldInsertionPoint = Builder.saveIP(); 573 Builder.SetInsertPoint(LoopHeaderInsertionPoint); 574 575 if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature)) { 576 SmallGEPIndices YValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent, 577 RsLaunchDimensionsFieldY})); 578 llvm::Value *YAddr = Builder.CreateInBoundsGEP(Arg_p, YValueGEP, "Y.gep"); 579 CalleeArgs.push_back(Builder.CreateLoad(YAddr, "Y")); 580 Bump(); 581 } 582 583 if (bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) { 584 SmallGEPIndices ZValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent, 585 RsLaunchDimensionsFieldZ})); 586 llvm::Value *ZAddr = Builder.CreateInBoundsGEP(Arg_p, ZValueGEP, "Z.gep"); 587 CalleeArgs.push_back(Builder.CreateLoad(ZAddr, "Z")); 588 Bump(); 589 } 590 591 Builder.restoreIP(OldInsertionPoint); 592 } 593 594 return Return; 595 } 596 597 // Generate loop-invariant input processing setup code for an expanded 598 // ForEach-able function or an expanded general reduction accumulator 599 // function. 600 // 601 // LoopHeader - block at the end of which the setup code will be inserted 602 // Arg_p - RSKernelDriverInfo pointer passed to the expanded function 603 // TBAAPointer - metadata for marking loads of pointer values out of RSKernelDriverInfo 604 // ArgIter - iterator pointing to first input of the UNexpanded function 605 // NumInputs - number of inputs (NOT number of ARGUMENTS) 606 // 607 // InTypes[] - this function saves input type, they will be used in ExpandInputsBody(). 608 // InBufPtrs[] - this function sets each array element to point to the first cell / byte 609 // (byte for x86, cell for other platforms) of the corresponding input allocation 610 // InStructTempSlots[] - this function sets each array element either to nullptr 611 // or to the result of an alloca (for the case where the 612 // calling convention dictates that a value must be passed 613 // by reference, and so we need a stacked temporary to hold 614 // a copy of that value) 615 void ExpandInputsLoopInvariant(llvm::IRBuilder<> &Builder, llvm::BasicBlock *LoopHeader, 616 llvm::Value *Arg_p, 617 llvm::MDNode *TBAAPointer, 618 llvm::Function::arg_iterator ArgIter, 619 const size_t NumInputs, 620 llvm::SmallVectorImpl<llvm::Type *> &InTypes, 621 llvm::SmallVectorImpl<llvm::Value *> &InBufPtrs, 622 llvm::SmallVectorImpl<llvm::Value *> &InStructTempSlots) { 623 bccAssert(NumInputs <= RS_KERNEL_INPUT_LIMIT); 624 625 // Extract information about input slots. The work done 626 // here is loop-invariant, so we can hoist the operations out of the loop. 627 auto OldInsertionPoint = Builder.saveIP(); 628 Builder.SetInsertPoint(LoopHeader->getTerminator()); 629 630 for (size_t InputIndex = 0; InputIndex < NumInputs; ++InputIndex, ArgIter++) { 631 llvm::Type *InType = ArgIter->getType(); 632 633 /* 634 * AArch64 calling conventions dictate that structs of sufficient size 635 * get passed by pointer instead of passed by value. This, combined 636 * with the fact that we don't allow kernels to operate on pointer 637 * data means that if we see a kernel with a pointer parameter we know 638 * that it is a struct input that has been promoted. As such we don't 639 * need to convert its type to a pointer. Later we will need to know 640 * to create a temporary copy on the stack, so we save this information 641 * in InStructTempSlots. 642 */ 643 if (auto PtrType = llvm::dyn_cast<llvm::PointerType>(InType)) { 644 llvm::Type *ElementType = PtrType->getElementType(); 645 InStructTempSlots.push_back(Builder.CreateAlloca(ElementType, nullptr, 646 "input_struct_slot")); 647 } else { 648 InType = InType->getPointerTo(); 649 InStructTempSlots.push_back(nullptr); 650 } 651 652 SmallGEPIndices InBufPtrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr, 653 static_cast<int32_t>(InputIndex)})); 654 llvm::Value *InBufPtrAddr = Builder.CreateInBoundsGEP(Arg_p, InBufPtrGEP, "input_buf.gep"); 655 llvm::LoadInst *InBufPtr = Builder.CreateLoad(InBufPtrAddr, "input_buf"); 656 657 llvm::Value *CastInBufPtr = nullptr; 658 if (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING) { 659 CastInBufPtr = Builder.CreatePointerCast(InBufPtr, InType, "casted_in"); 660 } else { 661 // The disagreement between module and x86 target machine datalayout 662 // causes mismatched input/output data offset between slang reflected 663 // code and bcc codegen for GetElementPtr. To solve this issue, skip the 664 // cast to InType and leave CastInBufPtr as an int8_t*. The buffer is 665 // later indexed with an explicit byte offset computed based on 666 // X86_CUSTOM_DL_STRING and then bitcast it to actual input type. 667 CastInBufPtr = InBufPtr; 668 } 669 670 if (gEnableRsTbaa) { 671 InBufPtr->setMetadata("tbaa", TBAAPointer); 672 } 673 674 InTypes.push_back(InType); 675 InBufPtrs.push_back(CastInBufPtr); 676 } 677 678 Builder.restoreIP(OldInsertionPoint); 679 } 680 681 // Generate loop-varying input processing code for an expanded ForEach-able function 682 // or an expanded general reduction accumulator function. Also, for the call to the 683 // UNexpanded function, collect the portion of the argument list corresponding to the 684 // inputs. 685 // 686 // Arg_x1 - first X coordinate to be processed by the expanded function 687 // TBAAAllocation - metadata for marking loads of input values out of allocations 688 // NumInputs -- number of inputs (NOT number of ARGUMENTS) 689 // InTypes[] - this function uses the saved input types in ExpandInputsLoopInvariant() 690 // to convert the pointer of byte InPtr to its real type. 691 // InBufPtrs[] - this function consumes the information produced by ExpandInputsLoopInvariant() 692 // InStructTempSlots[] - this function consumes the information produced by ExpandInputsLoopInvariant() 693 // IndVar - value of loop induction variable (X coordinate) for a given loop iteration 694 // 695 // RootArgs - this function sets this to the list of outgoing argument values corresponding 696 // to the inputs 697 void ExpandInputsBody(llvm::IRBuilder<> &Builder, 698 llvm::Value *Arg_x1, 699 llvm::MDNode *TBAAAllocation, 700 const size_t NumInputs, 701 const llvm::SmallVectorImpl<llvm::Type *> &InTypes, 702 const llvm::SmallVectorImpl<llvm::Value *> &InBufPtrs, 703 const llvm::SmallVectorImpl<llvm::Value *> &InStructTempSlots, 704 llvm::Value *IndVar, 705 llvm::SmallVectorImpl<llvm::Value *> &RootArgs) { 706 llvm::Value *Offset = Builder.CreateSub(IndVar, Arg_x1); 707 llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context); 708 709 for (size_t Index = 0; Index < NumInputs; ++Index) { 710 711 llvm::Value *InPtr = nullptr; 712 if (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING) { 713 InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], Offset); 714 } else { 715 // Treat x86 input buffer as byte[], get indexed pointer with explicit 716 // byte offset computed using a datalayout based on 717 // X86_CUSTOM_DL_STRING, then bitcast it to actual input type. 718 llvm::DataLayout DL(X86_CUSTOM_DL_STRING); 719 llvm::Type *InTy = InTypes[Index]; 720 uint64_t InStep = DL.getTypeAllocSize(InTy->getPointerElementType()); 721 llvm::Value *OffsetInBytes = Builder.CreateMul(Offset, llvm::ConstantInt::get(Int32Ty, InStep)); 722 InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], OffsetInBytes); 723 InPtr = Builder.CreatePointerCast(InPtr, InTy); 724 } 725 726 llvm::Value *Input; 727 llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input"); 728 729 if (gEnableRsTbaa) { 730 InputLoad->setMetadata("tbaa", TBAAAllocation); 731 } 732 733 if (llvm::Value *TemporarySlot = InStructTempSlots[Index]) { 734 // Pass a pointer to a temporary on the stack, rather than 735 // passing a pointer to the original value. We do not want 736 // the kernel to potentially modify the input data. 737 738 // Note: don't annotate with TBAA, since the kernel might 739 // have its own TBAA annotations for the pointer argument. 740 Builder.CreateStore(InputLoad, TemporarySlot); 741 Input = TemporarySlot; 742 } else { 743 Input = InputLoad; 744 } 745 746 RootArgs.push_back(Input); 747 } 748 } 749 750 /* Performs the actual optimization on a selected function. On success, the 751 * Module will contain a new function of the name "<NAME>.expand" that 752 * invokes <NAME>() in a loop with the appropriate parameters. 753 */ 754 bool ExpandOldStyleForEach(llvm::Function *Function, uint32_t Signature) { 755 ALOGV("Expanding ForEach-able Function %s", 756 Function->getName().str().c_str()); 757 758 if (!Signature) { 759 Signature = getRootSignature(Function); 760 if (!Signature) { 761 // We couldn't determine how to expand this function based on its 762 // function signature. 763 return false; 764 } 765 } 766 767 llvm::DataLayout DL(Module); 768 if (Module->getTargetTriple() == DEFAULT_X86_TRIPLE_STRING) { 769 DL.reset(X86_CUSTOM_DL_STRING); 770 } 771 772 llvm::Function *ExpandedFunction = 773 createEmptyExpandedForEachKernel(Function->getName()); 774 775 /* 776 * Extract the expanded function's parameters. It is guaranteed by 777 * createEmptyExpandedForEachKernel that there will be four parameters. 778 */ 779 780 bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams); 781 782 llvm::Function::arg_iterator ExpandedFunctionArgIter = 783 ExpandedFunction->arg_begin(); 784 785 llvm::Value *Arg_p = &*(ExpandedFunctionArgIter++); 786 llvm::Value *Arg_x1 = &*(ExpandedFunctionArgIter++); 787 llvm::Value *Arg_x2 = &*(ExpandedFunctionArgIter++); 788 llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter); 789 790 llvm::Value *InStep = nullptr; 791 llvm::Value *OutStep = nullptr; 792 793 // Construct the actual function body. 794 llvm::IRBuilder<> Builder(&*ExpandedFunction->getEntryBlock().begin()); 795 796 // Collect and construct the arguments for the kernel(). 797 // Note that we load any loop-invariant arguments before entering the Loop. 798 llvm::Function::arg_iterator FunctionArgIter = Function->arg_begin(); 799 800 llvm::Type *InTy = nullptr; 801 llvm::Value *InBufPtr = nullptr; 802 if (bcinfo::MetadataExtractor::hasForEachSignatureIn(Signature)) { 803 SmallGEPIndices InStepGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInStride, 0})); 804 llvm::LoadInst *InStepArg = Builder.CreateLoad( 805 Builder.CreateInBoundsGEP(Arg_p, InStepGEP, "instep_addr.gep"), "instep_addr"); 806 807 InTy = (FunctionArgIter++)->getType(); 808 InStep = getStepValue(&DL, InTy, InStepArg); 809 810 InStep->setName("instep"); 811 812 SmallGEPIndices InputAddrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr, 0})); 813 InBufPtr = Builder.CreateLoad( 814 Builder.CreateInBoundsGEP(Arg_p, InputAddrGEP, "input_buf.gep"), "input_buf"); 815 } 816 817 llvm::Type *OutTy = nullptr; 818 llvm::Value *OutBasePtr = nullptr; 819 if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) { 820 OutTy = (FunctionArgIter++)->getType(); 821 OutStep = getStepValue(&DL, OutTy, Arg_outstep); 822 OutStep->setName("outstep"); 823 SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0})); 824 OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep")); 825 } 826 827 llvm::Value *UsrData = nullptr; 828 if (bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)) { 829 llvm::Type *UsrDataTy = (FunctionArgIter++)->getType(); 830 llvm::Value *UsrDataPointerAddr = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldUsr); 831 UsrData = Builder.CreatePointerCast(Builder.CreateLoad(UsrDataPointerAddr), UsrDataTy); 832 UsrData->setName("UsrData"); 833 } 834 835 llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock(); 836 llvm::Value *IV; 837 createLoop(Builder, Arg_x1, Arg_x2, &IV); 838 839 llvm::SmallVector<llvm::Value*, 8> CalleeArgs; 840 const int CalleeArgsContextIdx = ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs, 841 [&FunctionArgIter]() { FunctionArgIter++; }, 842 LoopHeader->getTerminator()); 843 844 bccAssert(FunctionArgIter == Function->arg_end()); 845 846 // Populate the actual call to kernel(). 847 llvm::SmallVector<llvm::Value*, 8> RootArgs; 848 849 llvm::Value *InPtr = nullptr; 850 llvm::Value *OutPtr = nullptr; 851 852 // Calculate the current input and output pointers 853 // 854 // We always calculate the input/output pointers with a GEP operating on i8 855 // values and only cast at the very end to OutTy. This is because the step 856 // between two values is given in bytes. 857 // 858 // TODO: We could further optimize the output by using a GEP operation of 859 // type 'OutTy' in cases where the element type of the allocation allows. 860 if (OutBasePtr) { 861 llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1); 862 OutOffset = Builder.CreateMul(OutOffset, OutStep); 863 OutPtr = Builder.CreateInBoundsGEP(OutBasePtr, OutOffset); 864 OutPtr = Builder.CreatePointerCast(OutPtr, OutTy); 865 } 866 867 if (InBufPtr) { 868 llvm::Value *InOffset = Builder.CreateSub(IV, Arg_x1); 869 InOffset = Builder.CreateMul(InOffset, InStep); 870 InPtr = Builder.CreateInBoundsGEP(InBufPtr, InOffset); 871 InPtr = Builder.CreatePointerCast(InPtr, InTy); 872 } 873 874 if (InPtr) { 875 RootArgs.push_back(InPtr); 876 } 877 878 if (OutPtr) { 879 RootArgs.push_back(OutPtr); 880 } 881 882 if (UsrData) { 883 RootArgs.push_back(UsrData); 884 } 885 886 finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder); 887 888 Builder.CreateCall(Function, RootArgs); 889 890 return true; 891 } 892 893 /* Expand a pass-by-value foreach kernel. 894 */ 895 bool ExpandForEach(llvm::Function *Function, uint32_t Signature) { 896 bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature)); 897 ALOGV("Expanding kernel Function %s", Function->getName().str().c_str()); 898 899 // TODO: Refactor this to share functionality with ExpandOldStyleForEach. 900 llvm::DataLayout DL(Module); 901 if (Module->getTargetTriple() == DEFAULT_X86_TRIPLE_STRING) { 902 DL.reset(X86_CUSTOM_DL_STRING); 903 } 904 llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context); 905 906 llvm::Function *ExpandedFunction = 907 createEmptyExpandedForEachKernel(Function->getName()); 908 909 /* 910 * Extract the expanded function's parameters. It is guaranteed by 911 * createEmptyExpandedForEachKernel that there will be four parameters. 912 */ 913 914 bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams); 915 916 llvm::Function::arg_iterator ExpandedFunctionArgIter = 917 ExpandedFunction->arg_begin(); 918 919 llvm::Value *Arg_p = &*(ExpandedFunctionArgIter++); 920 llvm::Value *Arg_x1 = &*(ExpandedFunctionArgIter++); 921 llvm::Value *Arg_x2 = &*(ExpandedFunctionArgIter++); 922 // Arg_outstep is not used by expanded new-style forEach kernels. 923 924 // Construct the actual function body. 925 llvm::IRBuilder<> Builder(&*ExpandedFunction->getEntryBlock().begin()); 926 927 // Create TBAA meta-data. 928 llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript, 929 *TBAAAllocation, *TBAAPointer; 930 llvm::MDBuilder MDHelper(*Context); 931 932 TBAARenderScriptDistinct = 933 MDHelper.createTBAARoot(kRenderScriptTBAARootName); 934 TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName, 935 TBAARenderScriptDistinct); 936 TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation", 937 TBAARenderScript); 938 TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation, 939 TBAAAllocation, 0); 940 TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer", 941 TBAARenderScript); 942 TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0); 943 944 /* 945 * Collect and construct the arguments for the kernel(). 946 * 947 * Note that we load any loop-invariant arguments before entering the Loop. 948 */ 949 size_t NumRemainingInputs = Function->arg_size(); 950 951 // No usrData parameter on kernels. 952 bccAssert( 953 !bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)); 954 955 llvm::Function::arg_iterator ArgIter = Function->arg_begin(); 956 957 // Check the return type 958 llvm::Type *OutTy = nullptr; 959 llvm::LoadInst *OutBasePtr = nullptr; 960 llvm::Value *CastedOutBasePtr = nullptr; 961 962 bool PassOutByPointer = false; 963 964 if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) { 965 llvm::Type *OutBaseTy = Function->getReturnType(); 966 967 if (OutBaseTy->isVoidTy()) { 968 PassOutByPointer = true; 969 OutTy = ArgIter->getType(); 970 971 ArgIter++; 972 --NumRemainingInputs; 973 } else { 974 // We don't increment Args, since we are using the actual return type. 975 OutTy = OutBaseTy->getPointerTo(); 976 } 977 978 SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0})); 979 OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep")); 980 981 if (gEnableRsTbaa) { 982 OutBasePtr->setMetadata("tbaa", TBAAPointer); 983 } 984 985 if (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING) { 986 CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out"); 987 } else { 988 // The disagreement between module and x86 target machine datalayout 989 // causes mismatched input/output data offset between slang reflected 990 // code and bcc codegen for GetElementPtr. To solve this issue, skip the 991 // cast to OutTy and leave CastedOutBasePtr as an int8_t*. The buffer 992 // is later indexed with an explicit byte offset computed based on 993 // X86_CUSTOM_DL_STRING and then bitcast it to actual output type. 994 CastedOutBasePtr = OutBasePtr; 995 } 996 } 997 998 llvm::SmallVector<llvm::Type*, 8> InTypes; 999 llvm::SmallVector<llvm::Value*, 8> InBufPtrs; 1000 llvm::SmallVector<llvm::Value*, 8> InStructTempSlots; 1001 1002 bccAssert(NumRemainingInputs <= RS_KERNEL_INPUT_LIMIT); 1003 1004 // Create the loop structure. 1005 llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock(); 1006 llvm::Value *IV; 1007 createLoop(Builder, Arg_x1, Arg_x2, &IV); 1008 1009 llvm::SmallVector<llvm::Value*, 8> CalleeArgs; 1010 const int CalleeArgsContextIdx = 1011 ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs, 1012 [&NumRemainingInputs]() { --NumRemainingInputs; }, 1013 LoopHeader->getTerminator()); 1014 1015 // After ExpandSpecialArguments() gets called, NumRemainingInputs 1016 // counts the number of arguments to the kernel that correspond to 1017 // an array entry from the InPtr field of the DriverInfo 1018 // structure. 1019 const size_t NumInPtrArguments = NumRemainingInputs; 1020 1021 if (NumInPtrArguments > 0) { 1022 ExpandInputsLoopInvariant(Builder, LoopHeader, Arg_p, TBAAPointer, ArgIter, NumInPtrArguments, 1023 InTypes, InBufPtrs, InStructTempSlots); 1024 } 1025 1026 // Populate the actual call to kernel(). 1027 llvm::SmallVector<llvm::Value*, 8> RootArgs; 1028 1029 // Calculate the current input and output pointers. 1030 1031 // Output 1032 1033 llvm::Value *OutPtr = nullptr; 1034 if (CastedOutBasePtr) { 1035 llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1); 1036 1037 if (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING) { 1038 OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffset); 1039 } else { 1040 // Treat x86 output buffer as byte[], get indexed pointer with explicit 1041 // byte offset computed using a datalayout based on 1042 // X86_CUSTOM_DL_STRING, then bitcast it to actual output type. 1043 uint64_t OutStep = DL.getTypeAllocSize(OutTy->getPointerElementType()); 1044 llvm::Value *OutOffsetInBytes = Builder.CreateMul(OutOffset, llvm::ConstantInt::get(Int32Ty, OutStep)); 1045 OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffsetInBytes); 1046 OutPtr = Builder.CreatePointerCast(OutPtr, OutTy); 1047 } 1048 1049 if (PassOutByPointer) { 1050 RootArgs.push_back(OutPtr); 1051 } 1052 } 1053 1054 // Inputs 1055 1056 if (NumInPtrArguments > 0) { 1057 ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInPtrArguments, 1058 InTypes, InBufPtrs, InStructTempSlots, IV, RootArgs); 1059 } 1060 1061 finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder); 1062 1063 llvm::Value *RetVal = Builder.CreateCall(Function, RootArgs); 1064 1065 if (OutPtr && !PassOutByPointer) { 1066 RetVal->setName("call.result"); 1067 llvm::StoreInst *Store = Builder.CreateStore(RetVal, OutPtr); 1068 if (gEnableRsTbaa) { 1069 Store->setMetadata("tbaa", TBAAAllocation); 1070 } 1071 } 1072 1073 return true; 1074 } 1075 1076 // Certain categories of functions that make up a general 1077 // reduce-style kernel are called directly from the driver with no 1078 // expansion needed. For a function in such a category, we need to 1079 // promote linkage from static to external, to ensure that the 1080 // function is visible to the driver in the dynamic symbol table. 1081 // This promotion is safe because we don't have any kind of cross 1082 // translation unit linkage model (except for linking against 1083 // RenderScript libraries), so we do not risk name clashes. 1084 bool PromoteReduceFunction(const char *Name, FunctionSet &PromotedFunctions) { 1085 if (!Name) // a presumably-optional function that is not present 1086 return false; 1087 1088 llvm::Function *Fn = Module->getFunction(Name); 1089 bccAssert(Fn != nullptr); 1090 if (PromotedFunctions.insert(Fn).second) { 1091 bccAssert(Fn->getLinkage() == llvm::GlobalValue::InternalLinkage); 1092 Fn->setLinkage(llvm::GlobalValue::ExternalLinkage); 1093 return true; 1094 } 1095 1096 return false; 1097 } 1098 1099 // Expand the accumulator function for a general reduce-style kernel. 1100 // 1101 // The input is a function of the form 1102 // 1103 // define void @func(accumType* %accum, foo1 in1[, ... fooN inN] [, special arguments]) 1104 // 1105 // where all arguments except the first are the same as for a foreach kernel. 1106 // 1107 // The input accumulator function gets expanded into a function of the form 1108 // 1109 // define void @func.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, accumType* %accum) 1110 // 1111 // which performs a serial accumulaion of elements [x1, x2) into *%accum. 1112 // 1113 // In pseudocode, @func.expand does: 1114 // 1115 // for (i = %x1; i < %x2; ++i) { 1116 // func(%accum, 1117 // *((foo1 *)p->inPtr[0] + i)[, ... *((fooN *)p->inPtr[N-1] + i) 1118 // [, p] [, i] [, p->current.y] [, p->current.z]); 1119 // } 1120 // 1121 // This is very similar to foreach kernel expansion with no output. 1122 bool ExpandReduceAccumulator(llvm::Function *FnAccumulator, uint32_t Signature, size_t NumInputs) { 1123 ALOGV("Expanding accumulator %s for general reduce kernel", 1124 FnAccumulator->getName().str().c_str()); 1125 1126 // Create TBAA meta-data. 1127 llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript, 1128 *TBAAAllocation, *TBAAPointer; 1129 llvm::MDBuilder MDHelper(*Context); 1130 TBAARenderScriptDistinct = 1131 MDHelper.createTBAARoot(kRenderScriptTBAARootName); 1132 TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName, 1133 TBAARenderScriptDistinct); 1134 TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation", 1135 TBAARenderScript); 1136 TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation, 1137 TBAAAllocation, 0); 1138 TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer", 1139 TBAARenderScript); 1140 TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0); 1141 1142 auto AccumulatorArgIter = FnAccumulator->arg_begin(); 1143 1144 // Create empty accumulator function. 1145 llvm::Function *FnExpandedAccumulator = 1146 createEmptyExpandedReduceAccumulator(FnAccumulator->getName(), 1147 (AccumulatorArgIter++)->getType()); 1148 1149 // Extract the expanded accumulator's parameters. It is 1150 // guaranteed by createEmptyExpandedReduceAccumulator that 1151 // there will be 4 parameters. 1152 bccAssert(FnExpandedAccumulator->arg_size() == kNumExpandedReduceAccumulatorParams); 1153 auto ExpandedAccumulatorArgIter = FnExpandedAccumulator->arg_begin(); 1154 llvm::Value *Arg_p = &*(ExpandedAccumulatorArgIter++); 1155 llvm::Value *Arg_x1 = &*(ExpandedAccumulatorArgIter++); 1156 llvm::Value *Arg_x2 = &*(ExpandedAccumulatorArgIter++); 1157 llvm::Value *Arg_accum = &*(ExpandedAccumulatorArgIter++); 1158 1159 // Construct the actual function body. 1160 llvm::IRBuilder<> Builder(&*FnExpandedAccumulator->getEntryBlock().begin()); 1161 1162 // Create the loop structure. 1163 llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock(); 1164 llvm::Value *IndVar; 1165 createLoop(Builder, Arg_x1, Arg_x2, &IndVar); 1166 1167 llvm::SmallVector<llvm::Value*, 8> CalleeArgs; 1168 const int CalleeArgsContextIdx = 1169 ExpandSpecialArguments(Signature, IndVar, Arg_p, Builder, CalleeArgs, 1170 [](){}, LoopHeader->getTerminator()); 1171 1172 llvm::SmallVector<llvm::Type*, 8> InTypes; 1173 llvm::SmallVector<llvm::Value*, 8> InBufPtrs; 1174 llvm::SmallVector<llvm::Value*, 8> InStructTempSlots; 1175 ExpandInputsLoopInvariant(Builder, LoopHeader, Arg_p, TBAAPointer, AccumulatorArgIter, NumInputs, 1176 InTypes, InBufPtrs, InStructTempSlots); 1177 1178 // Populate the actual call to the original accumulator. 1179 llvm::SmallVector<llvm::Value*, 8> RootArgs; 1180 RootArgs.push_back(Arg_accum); 1181 ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInputs, InTypes, InBufPtrs, InStructTempSlots, 1182 IndVar, RootArgs); 1183 finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *FnAccumulator, Builder); 1184 Builder.CreateCall(FnAccumulator, RootArgs); 1185 1186 return true; 1187 } 1188 1189 // Create a combiner function for a general reduce-style kernel that lacks one, 1190 // by calling the accumulator function. 1191 // 1192 // The accumulator function must be of the form 1193 // 1194 // define void @accumFn(accumType* %accum, accumType %in) 1195 // 1196 // A combiner function will be generated of the form 1197 // 1198 // define void @accumFn.combiner(accumType* %accum, accumType* %other) { 1199 // %1 = load accumType, accumType* %other 1200 // call void @accumFn(accumType* %accum, accumType %1); 1201 // } 1202 bool CreateReduceCombinerFromAccumulator(llvm::Function *FnAccumulator) { 1203 ALOGV("Creating combiner from accumulator %s for general reduce kernel", 1204 FnAccumulator->getName().str().c_str()); 1205 1206 using llvm::Attribute; 1207 1208 bccAssert(FnAccumulator->arg_size() == 2); 1209 auto AccumulatorArgIter = FnAccumulator->arg_begin(); 1210 llvm::Value *AccumulatorArg_accum = &*(AccumulatorArgIter++); 1211 llvm::Value *AccumulatorArg_in = &*(AccumulatorArgIter++); 1212 llvm::Type *AccumulatorArgType = AccumulatorArg_accum->getType(); 1213 bccAssert(AccumulatorArgType->isPointerTy()); 1214 1215 llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context); 1216 llvm::FunctionType *CombinerType = 1217 llvm::FunctionType::get(VoidTy, { AccumulatorArgType, AccumulatorArgType }, false); 1218 llvm::Function *FnCombiner = 1219 llvm::Function::Create(CombinerType, llvm::GlobalValue::ExternalLinkage, 1220 nameReduceCombinerFromAccumulator(FnAccumulator->getName()), 1221 Module); 1222 1223 auto CombinerArgIter = FnCombiner->arg_begin(); 1224 1225 llvm::Argument *CombinerArg_accum = &(*CombinerArgIter++); 1226 CombinerArg_accum->setName("accum"); 1227 CombinerArg_accum->addAttr(llvm::AttributeSet::get(*Context, CombinerArg_accum->getArgNo() + 1, 1228 llvm::makeArrayRef(Attribute::NoCapture))); 1229 1230 llvm::Argument *CombinerArg_other = &(*CombinerArgIter++); 1231 CombinerArg_other->setName("other"); 1232 CombinerArg_other->addAttr(llvm::AttributeSet::get(*Context, CombinerArg_other->getArgNo() + 1, 1233 llvm::makeArrayRef(Attribute::NoCapture))); 1234 1235 llvm::BasicBlock *BB = llvm::BasicBlock::Create(*Context, "BB", FnCombiner); 1236 llvm::IRBuilder<> Builder(BB); 1237 1238 if (AccumulatorArg_in->getType()->isPointerTy()) { 1239 // Types of sufficient size get passed by pointer-to-copy rather 1240 // than passed by value. An accumulator cannot take a pointer 1241 // at the user level; so if we see a pointer here, we know that 1242 // we have a pass-by-pointer-to-copy case. 1243 llvm::Type *ElementType = AccumulatorArg_in->getType()->getPointerElementType(); 1244 llvm::Value *TempMem = Builder.CreateAlloca(ElementType, nullptr, "caller_copy"); 1245 Builder.CreateStore(Builder.CreateLoad(CombinerArg_other), TempMem); 1246 Builder.CreateCall(FnAccumulator, { CombinerArg_accum, TempMem }); 1247 } else { 1248 llvm::Value *TypeAdjustedOther = CombinerArg_other; 1249 if (AccumulatorArgType->getPointerElementType() != AccumulatorArg_in->getType()) { 1250 // Call lowering by frontend has done some type coercion 1251 TypeAdjustedOther = Builder.CreatePointerCast(CombinerArg_other, 1252 AccumulatorArg_in->getType()->getPointerTo(), 1253 "cast"); 1254 } 1255 llvm::Value *DerefOther = Builder.CreateLoad(TypeAdjustedOther); 1256 Builder.CreateCall(FnAccumulator, { CombinerArg_accum, DerefOther }); 1257 } 1258 Builder.CreateRetVoid(); 1259 1260 return true; 1261 } 1262 1263 /// @brief Checks if pointers to allocation internals are exposed 1264 /// 1265 /// This function verifies if through the parameters passed to the kernel 1266 /// or through calls to the runtime library the script gains access to 1267 /// pointers pointing to data within a RenderScript Allocation. 1268 /// If we know we control all loads from and stores to data within 1269 /// RenderScript allocations and if we know the run-time internal accesses 1270 /// are all annotated with RenderScript TBAA metadata, only then we 1271 /// can safely use TBAA to distinguish between generic and from-allocation 1272 /// pointers. 1273 bool allocPointersExposed(llvm::Module &Module) { 1274 // Old style kernel function can expose pointers to elements within 1275 // allocations. 1276 // TODO: Extend analysis to allow simple cases of old-style kernels. 1277 for (size_t i = 0; i < mExportForEachCount; ++i) { 1278 const char *Name = mExportForEachNameList[i]; 1279 uint32_t Signature = mExportForEachSignatureList[i]; 1280 if (Module.getFunction(Name) && 1281 !bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature)) { 1282 return true; 1283 } 1284 } 1285 1286 // Check for library functions that expose a pointer to an Allocation or 1287 // that are not yet annotated with RenderScript-specific tbaa information. 1288 static const std::vector<const char *> Funcs{ 1289 // rsGetElementAt(...) 1290 "_Z14rsGetElementAt13rs_allocationj", 1291 "_Z14rsGetElementAt13rs_allocationjj", 1292 "_Z14rsGetElementAt13rs_allocationjjj", 1293 1294 // rsSetElementAt() 1295 "_Z14rsSetElementAt13rs_allocationPvj", 1296 "_Z14rsSetElementAt13rs_allocationPvjj", 1297 "_Z14rsSetElementAt13rs_allocationPvjjj", 1298 1299 // rsGetElementAtYuv_uchar_Y() 1300 "_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj", 1301 1302 // rsGetElementAtYuv_uchar_U() 1303 "_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj", 1304 1305 // rsGetElementAtYuv_uchar_V() 1306 "_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj", 1307 }; 1308 1309 for (auto FI : Funcs) { 1310 llvm::Function *Function = Module.getFunction(FI); 1311 1312 if (!Function) { 1313 ALOGE("Missing run-time function '%s'", FI); 1314 return true; 1315 } 1316 1317 if (Function->getNumUses() > 0) { 1318 return true; 1319 } 1320 } 1321 1322 return false; 1323 } 1324 1325 /// @brief Connect RenderScript TBAA metadata to C/C++ metadata 1326 /// 1327 /// The TBAA metadata used to annotate loads/stores from RenderScript 1328 /// Allocations is generated in a separate TBAA tree with a 1329 /// "RenderScript Distinct TBAA" root node. LLVM does assume may-alias for 1330 /// all nodes in unrelated alias analysis trees. This function makes the 1331 /// "RenderScript TBAA" node (which is parented by the Distinct TBAA root), 1332 /// a subtree of the normal C/C++ TBAA tree aside of normal C/C++ types. With 1333 /// the connected trees every access to an Allocation is resolved to 1334 /// must-alias if compared to a normal C/C++ access. 1335 void connectRenderScriptTBAAMetadata(llvm::Module &Module) { 1336 llvm::MDBuilder MDHelper(*Context); 1337 llvm::MDNode *TBAARenderScriptDistinct = 1338 MDHelper.createTBAARoot("RenderScript Distinct TBAA"); 1339 llvm::MDNode *TBAARenderScript = MDHelper.createTBAANode( 1340 "RenderScript TBAA", TBAARenderScriptDistinct); 1341 llvm::MDNode *TBAARoot = MDHelper.createTBAARoot("Simple C/C++ TBAA"); 1342 TBAARenderScript->replaceOperandWith(1, TBAARoot); 1343 } 1344 1345 virtual bool runOnModule(llvm::Module &Module) { 1346 bool Changed = false; 1347 this->Module = &Module; 1348 Context = &Module.getContext(); 1349 1350 buildTypes(); 1351 1352 bcinfo::MetadataExtractor me(&Module); 1353 if (!me.extract()) { 1354 ALOGE("Could not extract metadata from module!"); 1355 return false; 1356 } 1357 1358 // Expand forEach_* style kernels. 1359 mExportForEachCount = me.getExportForEachSignatureCount(); 1360 mExportForEachNameList = me.getExportForEachNameList(); 1361 mExportForEachSignatureList = me.getExportForEachSignatureList(); 1362 1363 for (size_t i = 0; i < mExportForEachCount; ++i) { 1364 const char *name = mExportForEachNameList[i]; 1365 uint32_t signature = mExportForEachSignatureList[i]; 1366 llvm::Function *kernel = Module.getFunction(name); 1367 if (kernel) { 1368 if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) { 1369 Changed |= ExpandForEach(kernel, signature); 1370 kernel->setLinkage(llvm::GlobalValue::InternalLinkage); 1371 } else if (kernel->getReturnType()->isVoidTy()) { 1372 Changed |= ExpandOldStyleForEach(kernel, signature); 1373 kernel->setLinkage(llvm::GlobalValue::InternalLinkage); 1374 } else { 1375 // There are some graphics root functions that are not 1376 // expanded, but that will be called directly. For those 1377 // functions, we can not set the linkage to internal. 1378 } 1379 } 1380 } 1381 1382 // Process general reduce_* style functions. 1383 const size_t ExportReduceCount = me.getExportReduceCount(); 1384 const bcinfo::MetadataExtractor::Reduce *ExportReduceList = me.getExportReduceList(); 1385 // Note that functions can be shared between kernels 1386 FunctionSet PromotedFunctions, ExpandedAccumulators, AccumulatorsForCombiners; 1387 1388 for (size_t i = 0; i < ExportReduceCount; ++i) { 1389 Changed |= PromoteReduceFunction(ExportReduceList[i].mInitializerName, PromotedFunctions); 1390 Changed |= PromoteReduceFunction(ExportReduceList[i].mCombinerName, PromotedFunctions); 1391 Changed |= PromoteReduceFunction(ExportReduceList[i].mOutConverterName, PromotedFunctions); 1392 1393 // Accumulator 1394 llvm::Function *accumulator = Module.getFunction(ExportReduceList[i].mAccumulatorName); 1395 bccAssert(accumulator != nullptr); 1396 if (ExpandedAccumulators.insert(accumulator).second) 1397 Changed |= ExpandReduceAccumulator(accumulator, 1398 ExportReduceList[i].mSignature, 1399 ExportReduceList[i].mInputCount); 1400 if (!ExportReduceList[i].mCombinerName) { 1401 if (AccumulatorsForCombiners.insert(accumulator).second) 1402 Changed |= CreateReduceCombinerFromAccumulator(accumulator); 1403 } 1404 } 1405 1406 if (gEnableRsTbaa && !allocPointersExposed(Module)) { 1407 connectRenderScriptTBAAMetadata(Module); 1408 } 1409 1410 return Changed; 1411 } 1412 1413 virtual const char *getPassName() const { 1414 return "forEach_* and reduce_* function expansion"; 1415 } 1416 1417}; // end RSKernelExpandPass 1418 1419} // end anonymous namespace 1420 1421char RSKernelExpandPass::ID = 0; 1422static llvm::RegisterPass<RSKernelExpandPass> X("kernelexp", "Kernel Expand Pass"); 1423 1424namespace bcc { 1425 1426const char BCC_INDEX_VAR_NAME[] = "rsIndex"; 1427 1428llvm::ModulePass * 1429createRSKernelExpandPass(bool pEnableStepOpt) { 1430 return new RSKernelExpandPass(pEnableStepOpt); 1431} 1432 1433} // end namespace bcc 1434