RSKernelExpand.cpp revision e32af52d4be2bb80783404d99fa338b1143dbc9a
1/* 2 * Copyright 2012, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "bcc/Assert.h" 18#include "bcc/Renderscript/RSTransforms.h" 19 20#include <cstdlib> 21#include <functional> 22#include <unordered_set> 23 24#include <llvm/IR/DerivedTypes.h> 25#include <llvm/IR/Function.h> 26#include <llvm/IR/Instructions.h> 27#include <llvm/IR/IRBuilder.h> 28#include <llvm/IR/MDBuilder.h> 29#include <llvm/IR/Module.h> 30#include <llvm/Pass.h> 31#include <llvm/Support/raw_ostream.h> 32#include <llvm/IR/DataLayout.h> 33#include <llvm/IR/Function.h> 34#include <llvm/IR/Type.h> 35#include <llvm/Transforms/Utils/BasicBlockUtils.h> 36 37#include "bcc/Config/Config.h" 38#include "bcc/Support/Log.h" 39 40#include "bcinfo/MetadataExtractor.h" 41 42#ifndef __DISABLE_ASSERTS 43// Only used in bccAssert() 44const int kNumExpandedForeachParams = 4; 45const int kNumExpandedReduceParams = 3; 46const int kNumExpandedReduceNewAccumulatorParams = 4; 47#endif 48 49const char kRenderScriptTBAARootName[] = "RenderScript Distinct TBAA"; 50const char kRenderScriptTBAANodeName[] = "RenderScript TBAA"; 51 52using namespace bcc; 53 54namespace { 55 56static const bool gEnableRsTbaa = true; 57 58/* RSKernelExpandPass - This pass operates on functions that are able 59 * to be called via rsForEach(), "foreach_<NAME>", or 60 * "reduce_<NAME>". 
We create an inner loop for the function to be 61 * invoked over the appropriate data cells of the input/output 62 * allocations (adjusting other relevant parameters as we go). We 63 * support doing this for any forEach or reduce style compute 64 * kernels. The new function name is the original function name 65 * followed by ".expand". Note that we still generate code for the 66 * original function. 67 */ 68class RSKernelExpandPass : public llvm::ModulePass { 69public: 70 static char ID; 71 72private: 73 static const size_t RS_KERNEL_INPUT_LIMIT = 8; // see frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h 74 75 typedef std::unordered_set<llvm::Function *> FunctionSet; 76 77 enum RsLaunchDimensionsField { 78 RsLaunchDimensionsFieldX, 79 RsLaunchDimensionsFieldY, 80 RsLaunchDimensionsFieldZ, 81 RsLaunchDimensionsFieldLod, 82 RsLaunchDimensionsFieldFace, 83 RsLaunchDimensionsFieldArray, 84 85 RsLaunchDimensionsFieldCount 86 }; 87 88 enum RsExpandKernelDriverInfoPfxField { 89 RsExpandKernelDriverInfoPfxFieldInPtr, 90 RsExpandKernelDriverInfoPfxFieldInStride, 91 RsExpandKernelDriverInfoPfxFieldInLen, 92 RsExpandKernelDriverInfoPfxFieldOutPtr, 93 RsExpandKernelDriverInfoPfxFieldOutStride, 94 RsExpandKernelDriverInfoPfxFieldOutLen, 95 RsExpandKernelDriverInfoPfxFieldDim, 96 RsExpandKernelDriverInfoPfxFieldCurrent, 97 RsExpandKernelDriverInfoPfxFieldUsr, 98 RsExpandKernelDriverInfoPfxFieldUsLenr, 99 100 RsExpandKernelDriverInfoPfxFieldCount 101 }; 102 103 llvm::Module *Module; 104 llvm::LLVMContext *Context; 105 106 /* 107 * Pointers to LLVM type information for the function signatures 108 * for expanded functions. These must be re-calculated for each module 109 * the pass is run on.
110 */ 111 llvm::FunctionType *ExpandedForEachType, *ExpandedReduceType; 112 llvm::Type *RsExpandKernelDriverInfoPfxTy; 113 114 uint32_t mExportForEachCount; 115 const char **mExportForEachNameList; 116 const uint32_t *mExportForEachSignatureList; 117 118 uint32_t mExportReduceCount; 119 const char **mExportReduceNameList; 120 121 // Turns on optimization of allocation stride values. 122 bool mEnableStepOpt; 123 124 uint32_t getRootSignature(llvm::Function *Function) { 125 const llvm::NamedMDNode *ExportForEachMetadata = 126 Module->getNamedMetadata("#rs_export_foreach"); 127 128 if (!ExportForEachMetadata) { 129 llvm::SmallVector<llvm::Type*, 8> RootArgTys; 130 for (llvm::Function::arg_iterator B = Function->arg_begin(), 131 E = Function->arg_end(); 132 B != E; 133 ++B) { 134 RootArgTys.push_back(B->getType()); 135 } 136 137 // For pre-ICS bitcode, we may not have signature information. In that 138 // case, we use the size of the RootArgTys to select the number of 139 // arguments. 140 return (1 << RootArgTys.size()) - 1; 141 } 142 143 if (ExportForEachMetadata->getNumOperands() == 0) { 144 return 0; 145 } 146 147 bccAssert(ExportForEachMetadata->getNumOperands() > 0); 148 149 // We only handle the case for legacy root() functions here, so this is 150 // hard-coded to look at only the first such function. 
151 llvm::MDNode *SigNode = ExportForEachMetadata->getOperand(0); 152 if (SigNode != nullptr && SigNode->getNumOperands() == 1) { 153 llvm::Metadata *SigMD = SigNode->getOperand(0); 154 if (llvm::MDString *SigS = llvm::dyn_cast<llvm::MDString>(SigMD)) { 155 llvm::StringRef SigString = SigS->getString(); 156 uint32_t Signature = 0; 157 if (SigString.getAsInteger(10, Signature)) { 158 ALOGE("Non-integer signature value '%s'", SigString.str().c_str()); 159 return 0; 160 } 161 return Signature; 162 } 163 } 164 165 return 0; 166 } 167 168 bool isStepOptSupported(llvm::Type *AllocType) { 169 170 llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType); 171 llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context); 172 173 if (mEnableStepOpt) { 174 return false; 175 } 176 177 if (AllocType == VoidPtrTy) { 178 return false; 179 } 180 181 if (!PT) { 182 return false; 183 } 184 185 // remaining conditions are 64-bit only 186 if (VoidPtrTy->getPrimitiveSizeInBits() == 32) { 187 return true; 188 } 189 190 // coerce suggests an upconverted struct type, which we can't support 191 if (AllocType->getStructName().find("coerce") != llvm::StringRef::npos) { 192 return false; 193 } 194 195 // 2xi64 and i128 suggest an upconverted struct type, which are also unsupported 196 llvm::Type *V2xi64Ty = llvm::VectorType::get(llvm::Type::getInt64Ty(*Context), 2); 197 llvm::Type *Int128Ty = llvm::Type::getIntNTy(*Context, 128); 198 if (AllocType == V2xi64Ty || AllocType == Int128Ty) { 199 return false; 200 } 201 202 return true; 203 } 204 205 // Get the actual value we should use to step through an allocation. 206 // 207 // Normally the value we use to step through an allocation is given to us by 208 // the driver. However, for certain primitive data types, we can derive an 209 // integer constant for the step value. We use this integer constant whenever 210 // possible to allow further compiler optimizations to take place. 211 // 212 // DL - Target Data size/layout information. 
213 // T - Type of allocation (should be a pointer). 214 // OrigStep - Original step increment (root.expand() input from driver). 215 llvm::Value *getStepValue(llvm::DataLayout *DL, llvm::Type *AllocType, 216 llvm::Value *OrigStep) { 217 bccAssert(DL); 218 bccAssert(AllocType); 219 bccAssert(OrigStep); 220 llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType); 221 if (isStepOptSupported(AllocType)) { 222 llvm::Type *ET = PT->getElementType(); 223 uint64_t ETSize = DL->getTypeAllocSize(ET); 224 llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context); 225 return llvm::ConstantInt::get(Int32Ty, ETSize); 226 } else { 227 return OrigStep; 228 } 229 } 230 231 /// Builds the types required by the pass for the given context. 232 void buildTypes(void) { 233 // Create the RsLaunchDimensionsTy and RsExpandKernelDriverInfoPfxTy structs. 234 235 llvm::Type *Int8Ty = llvm::Type::getInt8Ty(*Context); 236 llvm::Type *Int8PtrTy = Int8Ty->getPointerTo(); 237 llvm::Type *Int8PtrArrayInputLimitTy = llvm::ArrayType::get(Int8PtrTy, RS_KERNEL_INPUT_LIMIT); 238 llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context); 239 llvm::Type *Int32ArrayInputLimitTy = llvm::ArrayType::get(Int32Ty, RS_KERNEL_INPUT_LIMIT); 240 llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context); 241 llvm::Type *Int32Array4Ty = llvm::ArrayType::get(Int32Ty, 4); 242 243 /* Defined in frameworks/base/libs/rs/cpu_ref/rsCpuCore.h: 244 * 245 * struct RsLaunchDimensions { 246 * uint32_t x; 247 * uint32_t y; 248 * uint32_t z; 249 * uint32_t lod; 250 * uint32_t face; 251 * uint32_t array[4]; 252 * }; 253 */ 254 llvm::SmallVector<llvm::Type*, RsLaunchDimensionsFieldCount> RsLaunchDimensionsTypes; 255 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t x 256 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t y 257 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t z 258 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t lod 259 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t face 
260 RsLaunchDimensionsTypes.push_back(Int32Array4Ty); // uint32_t array[4] 261 llvm::StructType *RsLaunchDimensionsTy = 262 llvm::StructType::create(RsLaunchDimensionsTypes, "RsLaunchDimensions"); 263 264 /* Defined as the beginning of RsExpandKernelDriverInfo in frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h: 265 * 266 * struct RsExpandKernelDriverInfoPfx { 267 * const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT]; 268 * uint32_t inStride[RS_KERNEL_INPUT_LIMIT]; 269 * uint32_t inLen; 270 * 271 * uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT]; 272 * uint32_t outStride[RS_KERNEL_INPUT_LIMIT]; 273 * uint32_t outLen; 274 * 275 * // Dimension of the launch 276 * RsLaunchDimensions dim; 277 * 278 * // The walking iterator of the launch 279 * RsLaunchDimensions current; 280 * 281 * const void *usr; 282 * uint32_t usrLen; 283 * 284 * // Items below this line are not used by the compiler and can be change in the driver. 285 * // So the compiler must assume there are an unknown number of fields of unknown type 286 * // beginning here. 287 * }; 288 * 289 * The name "RsExpandKernelDriverInfoPfx" is known to RSInvariantPass (RSInvariant.cpp). 
290 */ 291 llvm::SmallVector<llvm::Type*, RsExpandKernelDriverInfoPfxFieldCount> RsExpandKernelDriverInfoPfxTypes; 292 RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT] 293 RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy); // uint32_t inStride[RS_KERNEL_INPUT_LIMIT] 294 RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty); // uint32_t inLen 295 RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT] 296 RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy); // uint32_t outStride[RS_KERNEL_INPUT_LIMIT] 297 RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty); // uint32_t outLen 298 RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy); // RsLaunchDimensions dim 299 RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy); // RsLaunchDimensions current 300 RsExpandKernelDriverInfoPfxTypes.push_back(VoidPtrTy); // const void *usr 301 RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty); // uint32_t usrLen 302 RsExpandKernelDriverInfoPfxTy = 303 llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx"); 304 305 // Create the function type for expanded kernels. 306 llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context); 307 308 llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo(); 309 // void (const RsExpandKernelDriverInfoPfxTy *p, uint32_t x1, uint32_t x2, uint32_t outstep) 310 ExpandedForEachType = llvm::FunctionType::get(VoidTy, 311 {RsExpandKernelDriverInfoPfxPtrTy, Int32Ty, Int32Ty, Int32Ty}, false); 312 313 // void (void *inBuf, void *outBuf, uint32_t len) 314 ExpandedReduceType = llvm::FunctionType::get(VoidTy, {VoidPtrTy, VoidPtrTy, Int32Ty}, false); 315 } 316 317 /// @brief Create skeleton of the expanded foreach kernel. 
318 /// 319 /// This creates a function with the following signature: 320 /// 321 /// void (const RsForEachStubParamStruct *p, uint32_t x1, uint32_t x2, 322 /// uint32_t outstep) 323 /// 324 llvm::Function *createEmptyExpandedForEachKernel(llvm::StringRef OldName) { 325 llvm::Function *ExpandedFunction = 326 llvm::Function::Create(ExpandedForEachType, 327 llvm::GlobalValue::ExternalLinkage, 328 OldName + ".expand", Module); 329 bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams); 330 llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin(); 331 (AI++)->setName("p"); 332 (AI++)->setName("x1"); 333 (AI++)->setName("x2"); 334 (AI++)->setName("arg_outstep"); 335 llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin", 336 ExpandedFunction); 337 llvm::IRBuilder<> Builder(Begin); 338 Builder.CreateRetVoid(); 339 return ExpandedFunction; 340 } 341 342 // Create skeleton of the expanded reduce kernel. 343 // 344 // This creates a function with the following signature: 345 // 346 // void @func.expand(i8* nocapture %inBuf, i8* nocapture %outBuf, i32 len) 347 // 348 llvm::Function *createEmptyExpandedReduceKernel(llvm::StringRef OldName) { 349 llvm::Function *ExpandedFunction = 350 llvm::Function::Create(ExpandedReduceType, 351 llvm::GlobalValue::ExternalLinkage, 352 OldName + ".expand", Module); 353 bccAssert(ExpandedFunction->arg_size() == kNumExpandedReduceParams); 354 355 llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin(); 356 357 using llvm::Attribute; 358 359 llvm::Argument *InBuf = &(*AI++); 360 InBuf->setName("inBuf"); 361 InBuf->addAttr(llvm::AttributeSet::get(*Context, InBuf->getArgNo() + 1, llvm::makeArrayRef(Attribute::NoCapture))); 362 363 llvm::Argument *OutBuf = &(*AI++); 364 OutBuf->setName("outBuf"); 365 OutBuf->addAttr(llvm::AttributeSet::get(*Context, OutBuf->getArgNo() + 1, llvm::makeArrayRef(Attribute::NoCapture))); 366 367 (AI++)->setName("len"); 368 369 llvm::BasicBlock *Begin = 
llvm::BasicBlock::Create(*Context, "Begin", 370 ExpandedFunction); 371 llvm::IRBuilder<> Builder(Begin); 372 Builder.CreateRetVoid(); 373 374 return ExpandedFunction; 375 } 376 377 // Create skeleton of a general reduce kernel's expanded accumulator. 378 // 379 // This creates a function with the following signature: 380 // 381 // void @func.expand(%RsExpandKernelDriverInfoPfx* nocapture %p, 382 // i32 %x1, i32 %x2, accumType* nocapture %accum) 383 // 384 llvm::Function *createEmptyExpandedReduceNewAccumulator(llvm::StringRef OldName, 385 llvm::Type *AccumArgTy) { 386 llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context); 387 llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context); 388 llvm::FunctionType *ExpandedReduceNewAccumulatorType = 389 llvm::FunctionType::get(VoidTy, 390 {RsExpandKernelDriverInfoPfxTy->getPointerTo(), 391 Int32Ty, Int32Ty, AccumArgTy}, false); 392 llvm::Function *FnExpandedAccumulator = 393 llvm::Function::Create(ExpandedReduceNewAccumulatorType, 394 llvm::GlobalValue::ExternalLinkage, 395 OldName + ".expand", Module); 396 bccAssert(FnExpandedAccumulator->arg_size() == kNumExpandedReduceNewAccumulatorParams); 397 398 llvm::Function::arg_iterator AI = FnExpandedAccumulator->arg_begin(); 399 400 using llvm::Attribute; 401 402 llvm::Argument *Arg_p = &(*AI++); 403 Arg_p->setName("p"); 404 Arg_p->addAttr(llvm::AttributeSet::get(*Context, Arg_p->getArgNo() + 1, 405 llvm::makeArrayRef(Attribute::NoCapture))); 406 407 llvm::Argument *Arg_x1 = &(*AI++); 408 Arg_x1->setName("x1"); 409 410 llvm::Argument *Arg_x2 = &(*AI++); 411 Arg_x2->setName("x2"); 412 413 llvm::Argument *Arg_accum = &(*AI++); 414 Arg_accum->setName("accum"); 415 Arg_accum->addAttr(llvm::AttributeSet::get(*Context, Arg_accum->getArgNo() + 1, 416 llvm::makeArrayRef(Attribute::NoCapture))); 417 418 llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin", 419 FnExpandedAccumulator); 420 llvm::IRBuilder<> Builder(Begin); 421 Builder.CreateRetVoid(); 422 423 return 
FnExpandedAccumulator; 424 } 425 426 /// @brief Create an empty loop 427 /// 428 /// Create a loop of the form: 429 /// 430 /// for (i = LowerBound; i < UpperBound; i++) 431 /// ; 432 /// 433 /// After the loop has been created, the builder is set such that 434 /// instructions can be added to the loop body. 435 /// 436 /// @param Builder The builder to use to build this loop. The current 437 /// position of the builder is the position the loop 438 /// will be inserted. 439 /// @param LowerBound The first value of the loop iterator 440 /// @param UpperBound The maximal value of the loop iterator 441 /// @param LoopIV A reference that will be set to the loop iterator. 442 /// @return The BasicBlock that will be executed after the loop. 443 llvm::BasicBlock *createLoop(llvm::IRBuilder<> &Builder, 444 llvm::Value *LowerBound, 445 llvm::Value *UpperBound, 446 llvm::PHINode **LoopIV) { 447 bccAssert(LowerBound->getType() == UpperBound->getType()); 448 449 llvm::BasicBlock *CondBB, *AfterBB, *HeaderBB; 450 llvm::Value *Cond, *IVNext; 451 llvm::PHINode *IV; 452 453 CondBB = Builder.GetInsertBlock(); 454 AfterBB = llvm::SplitBlock(CondBB, Builder.GetInsertPoint(), nullptr, nullptr); 455 HeaderBB = llvm::BasicBlock::Create(*Context, "Loop", CondBB->getParent()); 456 457 // if (LowerBound < Upperbound) 458 // goto LoopHeader 459 // else 460 // goto AfterBB 461 CondBB->getTerminator()->eraseFromParent(); 462 Builder.SetInsertPoint(CondBB); 463 Cond = Builder.CreateICmpULT(LowerBound, UpperBound); 464 Builder.CreateCondBr(Cond, HeaderBB, AfterBB); 465 466 // iv = PHI [CondBB -> LowerBound], [LoopHeader -> NextIV ] 467 // iv.next = iv + 1 468 // if (iv.next < Upperbound) 469 // goto LoopHeader 470 // else 471 // goto AfterBB 472 Builder.SetInsertPoint(HeaderBB); 473 IV = Builder.CreatePHI(LowerBound->getType(), 2, "X"); 474 IV->addIncoming(LowerBound, CondBB); 475 IVNext = Builder.CreateNUWAdd(IV, Builder.getInt32(1)); 476 IV->addIncoming(IVNext, HeaderBB); 477 Cond = 
Builder.CreateICmpULT(IVNext, UpperBound); 478 Builder.CreateCondBr(Cond, HeaderBB, AfterBB); 479 AfterBB->setName("Exit"); 480 Builder.SetInsertPoint(HeaderBB->getFirstNonPHI()); 481 *LoopIV = IV; 482 return AfterBB; 483 } 484 485 // Finish building the outgoing argument list for calling a ForEach-able function. 486 // 487 // ArgVector - on input, the non-special arguments 488 // on output, the non-special arguments combined with the special arguments 489 // from SpecialArgVector 490 // SpecialArgVector - special arguments (from ExpandSpecialArguments()) 491 // SpecialArgContextIdx - return value of ExpandSpecialArguments() 492 // (position of context argument in SpecialArgVector) 493 // CalleeFunction - the ForEach-able function being called 494 // Builder - for inserting code into the caller function 495 template<unsigned int ArgVectorLen, unsigned int SpecialArgVectorLen> 496 void finishArgList( llvm::SmallVector<llvm::Value *, ArgVectorLen> &ArgVector, 497 const llvm::SmallVector<llvm::Value *, SpecialArgVectorLen> &SpecialArgVector, 498 const int SpecialArgContextIdx, 499 const llvm::Function &CalleeFunction, 500 llvm::IRBuilder<> &CallerBuilder) { 501 /* The context argument (if any) is a pointer to an opaque user-visible type that differs from 502 * the RsExpandKernelDriverInfoPfx type used in the function we are generating (although the 503 * two types represent the same thing). Therefore, we must introduce a pointer cast when 504 * generating a call to the kernel function. 505 */ 506 const int ArgContextIdx = 507 SpecialArgContextIdx >= 0 ? 
(ArgVector.size() + SpecialArgContextIdx) : SpecialArgContextIdx; 508 ArgVector.append(SpecialArgVector.begin(), SpecialArgVector.end()); 509 if (ArgContextIdx >= 0) { 510 llvm::Type *ContextArgType = nullptr; 511 int ArgIdx = ArgContextIdx; 512 for (const auto &Arg : CalleeFunction.getArgumentList()) { 513 if (!ArgIdx--) { 514 ContextArgType = Arg.getType(); 515 break; 516 } 517 } 518 bccAssert(ContextArgType); 519 ArgVector[ArgContextIdx] = CallerBuilder.CreatePointerCast(ArgVector[ArgContextIdx], ContextArgType); 520 } 521 } 522 523 // GEPHelper() returns a SmallVector of values suitable for passing 524 // to IRBuilder::CreateGEP(), and SmallGEPIndices is a typedef for 525 // the returned data type. It is sized so that the SmallVector 526 // returned by GEPHelper() never needs to do a heap allocation for 527 // any list of GEP indices it encounters in the code. 528 typedef llvm::SmallVector<llvm::Value *, 3> SmallGEPIndices; 529 530 // Helper for turning a list of constant integer GEP indices into a 531 // SmallVector of llvm::Value*. The return value is suitable for 532 // passing to a GetElementPtrInst constructor or IRBuilder::CreateGEP(). 533 // 534 // Inputs: 535 // I32Args should be integers which represent the index arguments 536 // to a GEP instruction. 537 // 538 // Returns: 539 // Returns a SmallVector of ConstantInts. 
540 SmallGEPIndices GEPHelper(const std::initializer_list<int32_t> I32Args) { 541 SmallGEPIndices Out(I32Args.size()); 542 llvm::IntegerType *I32Ty = llvm::Type::getInt32Ty(*Context); 543 std::transform(I32Args.begin(), I32Args.end(), Out.begin(), 544 [I32Ty](int32_t Arg) { return llvm::ConstantInt::get(I32Ty, Arg); }); 545 return Out; 546 } 547 548public: 549 RSKernelExpandPass(bool pEnableStepOpt = true) 550 : ModulePass(ID), Module(nullptr), Context(nullptr), 551 mEnableStepOpt(pEnableStepOpt) { 552 553 } 554 555 virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override { 556 // This pass does not use any other analysis passes, but it does 557 // add/wrap the existing functions in the module (thus altering the CFG). 558 } 559 560 // Build contribution to outgoing argument list for calling a 561 // ForEach-able function or a general reduction accumulator 562 // function, based on the special parameters of that function. 563 // 564 // Signature - metadata bits for the signature of the callee 565 // X, Arg_p - values derived directly from expanded function, 566 // suitable for computing arguments for the callee 567 // CalleeArgs - contribution is accumulated here 568 // Bump - invoked once for each contributed outgoing argument 569 // LoopHeaderInsertionPoint - an Instruction in the loop header, before which 570 // this function can insert loop-invariant loads 571 // 572 // Return value is the (zero-based) position of the context (Arg_p) 573 // argument in the CalleeArgs vector, or a negative value if the 574 // context argument is not placed in the CalleeArgs vector. 
575 int ExpandSpecialArguments(uint32_t Signature, 576 llvm::Value *X, 577 llvm::Value *Arg_p, 578 llvm::IRBuilder<> &Builder, 579 llvm::SmallVector<llvm::Value*, 8> &CalleeArgs, 580 std::function<void ()> Bump, 581 llvm::Instruction *LoopHeaderInsertionPoint) { 582 583 bccAssert(CalleeArgs.empty()); 584 585 int Return = -1; 586 if (bcinfo::MetadataExtractor::hasForEachSignatureCtxt(Signature)) { 587 CalleeArgs.push_back(Arg_p); 588 Bump(); 589 Return = CalleeArgs.size() - 1; 590 } 591 592 if (bcinfo::MetadataExtractor::hasForEachSignatureX(Signature)) { 593 CalleeArgs.push_back(X); 594 Bump(); 595 } 596 597 if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature) || 598 bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) { 599 bccAssert(LoopHeaderInsertionPoint); 600 601 // Y and Z are loop invariant, so they can be hoisted out of the 602 // loop. Set the IRBuilder insertion point to the loop header. 603 auto OldInsertionPoint = Builder.saveIP(); 604 Builder.SetInsertPoint(LoopHeaderInsertionPoint); 605 606 if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature)) { 607 SmallGEPIndices YValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent, 608 RsLaunchDimensionsFieldY})); 609 llvm::Value *YAddr = Builder.CreateInBoundsGEP(Arg_p, YValueGEP, "Y.gep"); 610 CalleeArgs.push_back(Builder.CreateLoad(YAddr, "Y")); 611 Bump(); 612 } 613 614 if (bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) { 615 SmallGEPIndices ZValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent, 616 RsLaunchDimensionsFieldZ})); 617 llvm::Value *ZAddr = Builder.CreateInBoundsGEP(Arg_p, ZValueGEP, "Z.gep"); 618 CalleeArgs.push_back(Builder.CreateLoad(ZAddr, "Z")); 619 Bump(); 620 } 621 622 Builder.restoreIP(OldInsertionPoint); 623 } 624 625 return Return; 626 } 627 628 // Generate loop-invariant input processing setup code for an expanded 629 // ForEach-able function or an expanded general reduction accumulator 630 // function. 
631 // 632 // LoopHeader - block at the end of which the setup code will be inserted 633 // Arg_p - RSKernelDriverInfo pointer passed to the expanded function 634 // TBAAPointer - metadata for marking loads of pointer values out of RSKernelDriverInfo 635 // ArgIter - iterator pointing to first input of the UNexpanded function 636 // NumInputs - number of inputs (NOT number of ARGUMENTS) 637 // 638 // InBufPtrs[] - this function sets each array element to point to the first 639 // cell of the corresponding input allocation 640 // InStructTempSlots[] - this function sets each array element either to nullptr 641 // or to the result of an alloca (for the case where the 642 // calling convention dictates that a value must be passed 643 // by reference, and so we need a stacked temporary to hold 644 // a copy of that value) 645 void ExpandInputsLoopInvariant(llvm::IRBuilder<> &Builder, llvm::BasicBlock *LoopHeader, 646 llvm::Value *Arg_p, 647 llvm::MDNode *TBAAPointer, 648 llvm::Function::arg_iterator ArgIter, 649 const size_t NumInputs, 650 llvm::SmallVectorImpl<llvm::Value *> &InBufPtrs, 651 llvm::SmallVectorImpl<llvm::Value *> &InStructTempSlots) { 652 bccAssert(NumInputs <= RS_KERNEL_INPUT_LIMIT); 653 654 // Extract information about input slots. The work done 655 // here is loop-invariant, so we can hoist the operations out of the loop. 656 auto OldInsertionPoint = Builder.saveIP(); 657 Builder.SetInsertPoint(LoopHeader->getTerminator()); 658 659 for (size_t InputIndex = 0; InputIndex < NumInputs; ++InputIndex, ArgIter++) { 660 llvm::Type *InType = ArgIter->getType(); 661 662 /* 663 * AArch64 calling conventions dictate that structs of sufficient size 664 * get passed by pointer instead of passed by value. This, combined 665 * with the fact that we don't allow kernels to operate on pointer 666 * data means that if we see a kernel with a pointer parameter we know 667 * that it is a struct input that has been promoted. 
As such we don't 668 * need to convert its type to a pointer. Later we will need to know 669 * to create a temporary copy on the stack, so we save this information 670 * in InStructTempSlots. 671 */ 672 if (auto PtrType = llvm::dyn_cast<llvm::PointerType>(InType)) { 673 llvm::Type *ElementType = PtrType->getElementType(); 674 InStructTempSlots.push_back(Builder.CreateAlloca(ElementType, nullptr, 675 "input_struct_slot")); 676 } else { 677 InType = InType->getPointerTo(); 678 InStructTempSlots.push_back(nullptr); 679 } 680 681 SmallGEPIndices InBufPtrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr, 682 static_cast<int32_t>(InputIndex)})); 683 llvm::Value *InBufPtrAddr = Builder.CreateInBoundsGEP(Arg_p, InBufPtrGEP, "input_buf.gep"); 684 llvm::LoadInst *InBufPtr = Builder.CreateLoad(InBufPtrAddr, "input_buf"); 685 llvm::Value *CastInBufPtr = Builder.CreatePointerCast(InBufPtr, InType, "casted_in"); 686 687 if (gEnableRsTbaa) { 688 InBufPtr->setMetadata("tbaa", TBAAPointer); 689 } 690 691 InBufPtrs.push_back(CastInBufPtr); 692 } 693 694 Builder.restoreIP(OldInsertionPoint); 695 } 696 697 // Generate loop-varying input processing code for an expanded ForEach-able function 698 // or an expanded general reduction accumulator function. Also, for the call to the 699 // UNexpanded function, collect the portion of the argument list corresponding to the 700 // inputs. 
701 // 702 // Arg_x1 - first X coordinate to be processed by the expanded function 703 // TBAAAllocation - metadata for marking loads of input values out of allocations 704 // NumInputs -- number of inputs (NOT number of ARGUMENTS) 705 // InBufPtrs[] - this function consumes the information produced by ExpandInputsLoopInvariant() 706 // InStructTempSlots[] - this function consumes the information produced by ExpandInputsLoopInvariant() 707 // IndVar - value of loop induction variable (X coordinate) for a given loop iteration 708 // 709 // RootArgs - this function sets this to the list of outgoing argument values corresponding 710 // to the inputs 711 void ExpandInputsBody(llvm::IRBuilder<> &Builder, 712 llvm::Value *Arg_x1, 713 llvm::MDNode *TBAAAllocation, 714 const size_t NumInputs, 715 const llvm::SmallVectorImpl<llvm::Value *> &InBufPtrs, 716 const llvm::SmallVectorImpl<llvm::Value *> &InStructTempSlots, 717 llvm::Value *IndVar, 718 llvm::SmallVectorImpl<llvm::Value *> &RootArgs) { 719 llvm::Value *Offset = Builder.CreateSub(IndVar, Arg_x1); 720 721 for (size_t Index = 0; Index < NumInputs; ++Index) { 722 llvm::Value *InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], Offset); 723 llvm::Value *Input; 724 725 llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input"); 726 727 if (gEnableRsTbaa) { 728 InputLoad->setMetadata("tbaa", TBAAAllocation); 729 } 730 731 if (llvm::Value *TemporarySlot = InStructTempSlots[Index]) { 732 // Pass a pointer to a temporary on the stack, rather than 733 // passing a pointer to the original value. We do not want 734 // the kernel to potentially modify the input data. 735 736 // Note: don't annotate with TBAA, since the kernel might 737 // have its own TBAA annotations for the pointer argument. 738 Builder.CreateStore(InputLoad, TemporarySlot); 739 Input = TemporarySlot; 740 } else { 741 Input = InputLoad; 742 } 743 744 RootArgs.push_back(Input); 745 } 746 } 747 748 /* Performs the actual optimization on a selected function. 
On success, the 749 * Module will contain a new function of the name "<NAME>.expand" that 750 * invokes <NAME>() in a loop with the appropriate parameters. 751 */ 752 bool ExpandOldStyleForEach(llvm::Function *Function, uint32_t Signature) { 753 ALOGV("Expanding ForEach-able Function %s", 754 Function->getName().str().c_str()); 755 756 if (!Signature) { 757 Signature = getRootSignature(Function); 758 if (!Signature) { 759 // We couldn't determine how to expand this function based on its 760 // function signature. 761 return false; 762 } 763 } 764 765 llvm::DataLayout DL(Module); 766 767 llvm::Function *ExpandedFunction = 768 createEmptyExpandedForEachKernel(Function->getName()); 769 770 /* 771 * Extract the expanded function's parameters. It is guaranteed by 772 * createEmptyExpandedForEachKernel that there will be four parameters. 773 */ 774 775 bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams); 776 777 llvm::Function::arg_iterator ExpandedFunctionArgIter = 778 ExpandedFunction->arg_begin(); 779 780 llvm::Value *Arg_p = &*(ExpandedFunctionArgIter++); 781 llvm::Value *Arg_x1 = &*(ExpandedFunctionArgIter++); 782 llvm::Value *Arg_x2 = &*(ExpandedFunctionArgIter++); 783 llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter); 784 785 llvm::Value *InStep = nullptr; 786 llvm::Value *OutStep = nullptr; 787 788 // Construct the actual function body. 789 llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin()); 790 791 // Collect and construct the arguments for the kernel(). 792 // Note that we load any loop-invariant arguments before entering the Loop. 
793 llvm::Function::arg_iterator FunctionArgIter = Function->arg_begin(); 794 795 llvm::Type *InTy = nullptr; 796 llvm::Value *InBufPtr = nullptr; 797 if (bcinfo::MetadataExtractor::hasForEachSignatureIn(Signature)) { 798 SmallGEPIndices InStepGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInStride, 0})); 799 llvm::LoadInst *InStepArg = Builder.CreateLoad( 800 Builder.CreateInBoundsGEP(Arg_p, InStepGEP, "instep_addr.gep"), "instep_addr"); 801 802 InTy = (FunctionArgIter++)->getType(); 803 InStep = getStepValue(&DL, InTy, InStepArg); 804 805 InStep->setName("instep"); 806 807 SmallGEPIndices InputAddrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr, 0})); 808 InBufPtr = Builder.CreateLoad( 809 Builder.CreateInBoundsGEP(Arg_p, InputAddrGEP, "input_buf.gep"), "input_buf"); 810 } 811 812 llvm::Type *OutTy = nullptr; 813 llvm::Value *OutBasePtr = nullptr; 814 if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) { 815 OutTy = (FunctionArgIter++)->getType(); 816 OutStep = getStepValue(&DL, OutTy, Arg_outstep); 817 OutStep->setName("outstep"); 818 SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0})); 819 OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep")); 820 } 821 822 llvm::Value *UsrData = nullptr; 823 if (bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)) { 824 llvm::Type *UsrDataTy = (FunctionArgIter++)->getType(); 825 llvm::Value *UsrDataPointerAddr = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldUsr); 826 UsrData = Builder.CreatePointerCast(Builder.CreateLoad(UsrDataPointerAddr), UsrDataTy); 827 UsrData->setName("UsrData"); 828 } 829 830 llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock(); 831 llvm::PHINode *IV; 832 createLoop(Builder, Arg_x1, Arg_x2, &IV); 833 834 llvm::SmallVector<llvm::Value*, 8> CalleeArgs; 835 const int CalleeArgsContextIdx = ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs, 836 
[&FunctionArgIter]() { FunctionArgIter++; }, 837 LoopHeader->getTerminator()); 838 839 bccAssert(FunctionArgIter == Function->arg_end()); 840 841 // Populate the actual call to kernel(). 842 llvm::SmallVector<llvm::Value*, 8> RootArgs; 843 844 llvm::Value *InPtr = nullptr; 845 llvm::Value *OutPtr = nullptr; 846 847 // Calculate the current input and output pointers 848 // 849 // We always calculate the input/output pointers with a GEP operating on i8 850 // values and only cast at the very end to OutTy. This is because the step 851 // between two values is given in bytes. 852 // 853 // TODO: We could further optimize the output by using a GEP operation of 854 // type 'OutTy' in cases where the element type of the allocation allows. 855 if (OutBasePtr) { 856 llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1); 857 OutOffset = Builder.CreateMul(OutOffset, OutStep); 858 OutPtr = Builder.CreateInBoundsGEP(OutBasePtr, OutOffset); 859 OutPtr = Builder.CreatePointerCast(OutPtr, OutTy); 860 } 861 862 if (InBufPtr) { 863 llvm::Value *InOffset = Builder.CreateSub(IV, Arg_x1); 864 InOffset = Builder.CreateMul(InOffset, InStep); 865 InPtr = Builder.CreateInBoundsGEP(InBufPtr, InOffset); 866 InPtr = Builder.CreatePointerCast(InPtr, InTy); 867 } 868 869 if (InPtr) { 870 RootArgs.push_back(InPtr); 871 } 872 873 if (OutPtr) { 874 RootArgs.push_back(OutPtr); 875 } 876 877 if (UsrData) { 878 RootArgs.push_back(UsrData); 879 } 880 881 finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder); 882 883 Builder.CreateCall(Function, RootArgs); 884 885 return true; 886 } 887 888 /* Expand a pass-by-value foreach kernel. 889 */ 890 bool ExpandForEach(llvm::Function *Function, uint32_t Signature) { 891 bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature)); 892 ALOGV("Expanding kernel Function %s", Function->getName().str().c_str()); 893 894 // TODO: Refactor this to share functionality with ExpandOldStyleForEach. 
895 llvm::DataLayout DL(Module); 896 897 llvm::Function *ExpandedFunction = 898 createEmptyExpandedForEachKernel(Function->getName()); 899 900 /* 901 * Extract the expanded function's parameters. It is guaranteed by 902 * createEmptyExpandedForEachKernel that there will be four parameters. 903 */ 904 905 bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams); 906 907 llvm::Function::arg_iterator ExpandedFunctionArgIter = 908 ExpandedFunction->arg_begin(); 909 910 llvm::Value *Arg_p = &*(ExpandedFunctionArgIter++); 911 llvm::Value *Arg_x1 = &*(ExpandedFunctionArgIter++); 912 llvm::Value *Arg_x2 = &*(ExpandedFunctionArgIter++); 913 // Arg_outstep is not used by expanded new-style forEach kernels. 914 915 // Construct the actual function body. 916 llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin()); 917 918 // Create TBAA meta-data. 919 llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript, 920 *TBAAAllocation, *TBAAPointer; 921 llvm::MDBuilder MDHelper(*Context); 922 923 TBAARenderScriptDistinct = 924 MDHelper.createTBAARoot(kRenderScriptTBAARootName); 925 TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName, 926 TBAARenderScriptDistinct); 927 TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation", 928 TBAARenderScript); 929 TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation, 930 TBAAAllocation, 0); 931 TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer", 932 TBAARenderScript); 933 TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0); 934 935 /* 936 * Collect and construct the arguments for the kernel(). 937 * 938 * Note that we load any loop-invariant arguments before entering the Loop. 939 */ 940 size_t NumRemainingInputs = Function->arg_size(); 941 942 // No usrData parameter on kernels. 
943 bccAssert( 944 !bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)); 945 946 llvm::Function::arg_iterator ArgIter = Function->arg_begin(); 947 948 // Check the return type 949 llvm::Type *OutTy = nullptr; 950 llvm::LoadInst *OutBasePtr = nullptr; 951 llvm::Value *CastedOutBasePtr = nullptr; 952 953 bool PassOutByPointer = false; 954 955 if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) { 956 llvm::Type *OutBaseTy = Function->getReturnType(); 957 958 if (OutBaseTy->isVoidTy()) { 959 PassOutByPointer = true; 960 OutTy = ArgIter->getType(); 961 962 ArgIter++; 963 --NumRemainingInputs; 964 } else { 965 // We don't increment Args, since we are using the actual return type. 966 OutTy = OutBaseTy->getPointerTo(); 967 } 968 969 SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0})); 970 OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep")); 971 972 if (gEnableRsTbaa) { 973 OutBasePtr->setMetadata("tbaa", TBAAPointer); 974 } 975 976 CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out"); 977 } 978 979 llvm::SmallVector<llvm::Value*, 8> InBufPtrs; 980 llvm::SmallVector<llvm::Value*, 8> InStructTempSlots; 981 982 bccAssert(NumRemainingInputs <= RS_KERNEL_INPUT_LIMIT); 983 984 // Create the loop structure. 985 llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock(); 986 llvm::PHINode *IV; 987 createLoop(Builder, Arg_x1, Arg_x2, &IV); 988 989 llvm::SmallVector<llvm::Value*, 8> CalleeArgs; 990 const int CalleeArgsContextIdx = 991 ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs, 992 [&NumRemainingInputs]() { --NumRemainingInputs; }, 993 LoopHeader->getTerminator()); 994 995 // After ExpandSpecialArguments() gets called, NumRemainingInputs 996 // counts the number of arguments to the kernel that correspond to 997 // an array entry from the InPtr field of the DriverInfo 998 // structure. 
999 const size_t NumInPtrArguments = NumRemainingInputs; 1000 1001 if (NumInPtrArguments > 0) { 1002 ExpandInputsLoopInvariant(Builder, LoopHeader, Arg_p, TBAAPointer, ArgIter, NumInPtrArguments, 1003 InBufPtrs, InStructTempSlots); 1004 } 1005 1006 // Populate the actual call to kernel(). 1007 llvm::SmallVector<llvm::Value*, 8> RootArgs; 1008 1009 // Calculate the current input and output pointers. 1010 1011 // Output 1012 1013 llvm::Value *OutPtr = nullptr; 1014 if (CastedOutBasePtr) { 1015 llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1); 1016 OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffset); 1017 1018 if (PassOutByPointer) { 1019 RootArgs.push_back(OutPtr); 1020 } 1021 } 1022 1023 // Inputs 1024 1025 if (NumInPtrArguments > 0) { 1026 ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInPtrArguments, 1027 InBufPtrs, InStructTempSlots, IV, RootArgs); 1028 } 1029 1030 finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder); 1031 1032 llvm::Value *RetVal = Builder.CreateCall(Function, RootArgs); 1033 1034 if (OutPtr && !PassOutByPointer) { 1035 RetVal->setName("call.result"); 1036 llvm::StoreInst *Store = Builder.CreateStore(RetVal, OutPtr); 1037 if (gEnableRsTbaa) { 1038 Store->setMetadata("tbaa", TBAAAllocation); 1039 } 1040 } 1041 1042 return true; 1043 } 1044 1045 // Expand a simple reduce-style kernel function. 1046 // 1047 // The input is a kernel which represents a binary operation, 1048 // of the form 1049 // 1050 // define foo @func(foo %a, foo %b), 1051 // 1052 // (More generally, it can be of the forms 1053 // 1054 // define void @func(foo* %ret, foo* %a, foo* %b) 1055 // define void @func(foo* %ret, foo1 %a, foo1 %b) 1056 // define foo1 @func(foo2 %a, foo2 %b) 1057 // 1058 // as a result of argument / return value conversions. Here, "foo1" 1059 // and "foo2" refer to possibly coerced types, and the coerced 1060 // argument type may be different from the coerced return type. 
See 1061 // "Note on coercion" below.) 1062 // 1063 // Note also, we do not expect to encounter any case when the 1064 // arguments are promoted to pointers but the return value is 1065 // unpromoted to pointer, e.g. 1066 // 1067 // define foo1 @func(foo* %a, foo* %b) 1068 // 1069 // and we will throw an assertion in this case.) 1070 // 1071 // The input kernel gets expanded into a kernel of the form 1072 // 1073 // define void @func.expand(i8* %inBuf, i8* outBuf, i32 len) 1074 // 1075 // which performs a serial reduction of `len` elements from `inBuf`, 1076 // and stores the result into `outBuf`. In pseudocode, @func.expand 1077 // does: 1078 // 1079 // inArr := (foo *)inBuf; 1080 // accum := inArr[0]; 1081 // for (i := 1; i < len; ++i) { 1082 // accum := foo(accum, inArr[i]); 1083 // } 1084 // *(foo *)outBuf := accum; 1085 // 1086 // Note on coercion 1087 // 1088 // Both the return value and the argument types may undergo internal 1089 // coercion in clang as part of call lowering. As a result, the 1090 // return value type may differ from the argument type even if the 1091 // types in the RenderScript signaure are the same. For instance, the 1092 // kernel 1093 // 1094 // int3 add(int3 a, int3 b) { return a + b; } 1095 // 1096 // gets lowered by clang as 1097 // 1098 // define <3 x i32> @add(<4 x i32> %a.coerce, <4 x i32> %b.coerce) 1099 // 1100 // under AArch64. The details of this process are found in clang, 1101 // lib/CodeGen/TargetInfo.cpp, under classifyArgumentType() and 1102 // classifyReturnType() in ARMABIInfo, AArch64ABIInfo. If the value 1103 // is passed by pointer, then the pointed-to type is not coerced. 1104 // 1105 // Since we lack the original type information, this code does loads 1106 // and stores of allocation data by way of pointers to the coerced 1107 // type. 
1108 bool ExpandReduce(llvm::Function *Function) { 1109 bccAssert(Function); 1110 1111 ALOGV("Expanding simple reduce kernel %s", Function->getName().str().c_str()); 1112 1113 llvm::DataLayout DL(Module); 1114 1115 // TBAA Metadata 1116 llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript, *TBAAAllocation; 1117 llvm::MDBuilder MDHelper(*Context); 1118 1119 TBAARenderScriptDistinct = 1120 MDHelper.createTBAARoot(kRenderScriptTBAARootName); 1121 TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName, 1122 TBAARenderScriptDistinct); 1123 TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation", 1124 TBAARenderScript); 1125 TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation, 1126 TBAAAllocation, 0); 1127 1128 llvm::Function *ExpandedFunction = 1129 createEmptyExpandedReduceKernel(Function->getName()); 1130 1131 // Extract the expanded kernel's parameters. It is guaranteed by 1132 // createEmptyExpandedReduceKernel that there will be 3 parameters. 1133 auto ExpandedFunctionArgIter = ExpandedFunction->arg_begin(); 1134 1135 llvm::Value *Arg_inBuf = &*(ExpandedFunctionArgIter++); 1136 llvm::Value *Arg_outBuf = &*(ExpandedFunctionArgIter++); 1137 llvm::Value *Arg_len = &*(ExpandedFunctionArgIter++); 1138 1139 bccAssert(Function->arg_size() == 2 || Function->arg_size() == 3); 1140 1141 // Check if, instead of returning a value, the original kernel has 1142 // a pointer parameter which points to a temporary buffer into 1143 // which the return value gets written. 1144 const bool ReturnValuePointerStyle = (Function->arg_size() == 3); 1145 bccAssert(Function->getReturnType()->isVoidTy() == ReturnValuePointerStyle); 1146 1147 // Check if, instead of being passed by value, the inputs to the 1148 // original kernel are passed by pointer. 1149 auto FirstArgIter = Function->arg_begin(); 1150 // The second argument is always an input to the original kernel. 
1151 auto SecondArgIter = std::next(FirstArgIter); 1152 const bool InputsPointerStyle = SecondArgIter->getType()->isPointerTy(); 1153 1154 // Get the output type (i.e. return type of the original kernel). 1155 llvm::PointerType *OutPtrTy = nullptr; 1156 llvm::Type *OutTy = nullptr; 1157 if (ReturnValuePointerStyle) { 1158 OutPtrTy = llvm::dyn_cast<llvm::PointerType>(FirstArgIter->getType()); 1159 bccAssert(OutPtrTy && "Expected a pointer parameter to kernel"); 1160 OutTy = OutPtrTy->getElementType(); 1161 } else { 1162 OutTy = Function->getReturnType(); 1163 bccAssert(!OutTy->isVoidTy()); 1164 OutPtrTy = OutTy->getPointerTo(); 1165 } 1166 1167 // Get the input type (type of the arguments to the original 1168 // kernel). Some input types are different from the output type, 1169 // due to explicit coercion that the compiler performs when 1170 // lowering the parameters. See "Note on coercion" above. 1171 llvm::PointerType *InPtrTy; 1172 llvm::Type *InTy; 1173 if (InputsPointerStyle) { 1174 InPtrTy = llvm::dyn_cast<llvm::PointerType>(SecondArgIter->getType()); 1175 bccAssert(InPtrTy && "Expected a pointer parameter to kernel"); 1176 bccAssert(ReturnValuePointerStyle); 1177 bccAssert(std::next(SecondArgIter)->getType() == InPtrTy && 1178 "Input type mismatch"); 1179 InTy = InPtrTy->getElementType(); 1180 } else { 1181 InTy = SecondArgIter->getType(); 1182 InPtrTy = InTy->getPointerTo(); 1183 if (!ReturnValuePointerStyle) { 1184 bccAssert(InTy == FirstArgIter->getType() && "Input type mismatch"); 1185 } else { 1186 bccAssert(InTy == std::next(SecondArgIter)->getType() && 1187 "Input type mismatch"); 1188 } 1189 } 1190 1191 // The input type should take up the same amount of space in 1192 // memory as the output type. 1193 bccAssert(DL.getTypeAllocSize(InTy) == DL.getTypeAllocSize(OutTy)); 1194 1195 // Construct the actual function body. 
1196 llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin()); 1197 1198 // Cast input and output buffers to appropriate types. 1199 llvm::Value *InBuf = Builder.CreatePointerCast(Arg_inBuf, InPtrTy); 1200 llvm::Value *OutBuf = Builder.CreatePointerCast(Arg_outBuf, OutPtrTy); 1201 1202 // Create a slot to pass temporary results back. This needs to be 1203 // separate from the accumulator slot because the kernel may mark 1204 // the return value slot as noalias. 1205 llvm::Value *ReturnBuf = nullptr; 1206 if (ReturnValuePointerStyle) { 1207 ReturnBuf = Builder.CreateAlloca(OutTy, nullptr, "ret.tmp"); 1208 } 1209 1210 // Create a slot to hold the second input if the inputs are passed 1211 // by pointer to the original kernel. We cannot directly pass a 1212 // pointer to the input buffer, because the kernel may modify its 1213 // inputs. 1214 llvm::Value *SecondInputTempBuf = nullptr; 1215 if (InputsPointerStyle) { 1216 SecondInputTempBuf = Builder.CreateAlloca(InTy, nullptr, "in.tmp"); 1217 } 1218 1219 // Create a slot to accumulate temporary results, and fill it with 1220 // the first value. 1221 llvm::Value *AccumBuf = Builder.CreateAlloca(OutTy, nullptr, "accum"); 1222 // Cast to OutPtrTy before loading, since AccumBuf has type OutPtrTy. 1223 llvm::LoadInst *FirstElementLoad = Builder.CreateLoad( 1224 Builder.CreatePointerCast(InBuf, OutPtrTy)); 1225 if (gEnableRsTbaa) { 1226 FirstElementLoad->setMetadata("tbaa", TBAAAllocation); 1227 } 1228 // Memory operations with AccumBuf shouldn't be marked with 1229 // RenderScript TBAA, since this might conflict with TBAA metadata 1230 // in the kernel function when AccumBuf is passed by pointer. 1231 Builder.CreateStore(FirstElementLoad, AccumBuf); 1232 1233 // Loop body 1234 1235 // Create the loop structure. Note that the first input in the input buffer 1236 // has already been accumulated, so that we start at index 1. 
1237 llvm::PHINode *IndVar; 1238 llvm::Value *Start = llvm::ConstantInt::get(Arg_len->getType(), 1); 1239 llvm::BasicBlock *Exit = createLoop(Builder, Start, Arg_len, &IndVar); 1240 1241 llvm::Value *InputPtr = Builder.CreateInBoundsGEP(InBuf, IndVar, "next_input.gep"); 1242 1243 // Set up arguments and call the original (unexpanded) kernel. 1244 // 1245 // The original kernel can have at most 3 arguments, which is 1246 // achieved when the signature looks like: 1247 // 1248 // define void @func(foo* %ret, bar %a, bar %b) 1249 // 1250 // (bar can be one of foo/foo.coerce/foo*). 1251 llvm::SmallVector<llvm::Value *, 3> KernelArgs; 1252 1253 if (ReturnValuePointerStyle) { 1254 KernelArgs.push_back(ReturnBuf); 1255 } 1256 1257 if (InputsPointerStyle) { 1258 bccAssert(ReturnValuePointerStyle); 1259 // Because the return buffer is copied back into the 1260 // accumulator, it's okay if the accumulator is overwritten. 1261 KernelArgs.push_back(AccumBuf); 1262 1263 llvm::LoadInst *InputLoad = Builder.CreateLoad(InputPtr); 1264 if (gEnableRsTbaa) { 1265 InputLoad->setMetadata("tbaa", TBAAAllocation); 1266 } 1267 Builder.CreateStore(InputLoad, SecondInputTempBuf); 1268 1269 KernelArgs.push_back(SecondInputTempBuf); 1270 } else { 1271 // InPtrTy may be different from OutPtrTy (the type of 1272 // AccumBuf), so first cast the accumulator buffer to the 1273 // pointer type corresponding to the input argument type. 1274 KernelArgs.push_back( 1275 Builder.CreateLoad(Builder.CreatePointerCast(AccumBuf, InPtrTy))); 1276 1277 llvm::LoadInst *LoadedArg = Builder.CreateLoad(InputPtr); 1278 if (gEnableRsTbaa) { 1279 LoadedArg->setMetadata("tbaa", TBAAAllocation); 1280 } 1281 KernelArgs.push_back(LoadedArg); 1282 } 1283 1284 llvm::Value *RetVal = Builder.CreateCall(Function, KernelArgs); 1285 1286 const uint64_t ElementSize = DL.getTypeStoreSize(OutTy); 1287 const uint64_t ElementAlign = DL.getABITypeAlignment(OutTy); 1288 1289 // Store the output in the accumulator. 
1290 if (ReturnValuePointerStyle) { 1291 Builder.CreateMemCpy(AccumBuf, ReturnBuf, ElementSize, ElementAlign); 1292 } else { 1293 Builder.CreateStore(RetVal, AccumBuf); 1294 } 1295 1296 // Loop exit 1297 Builder.SetInsertPoint(Exit, Exit->begin()); 1298 1299 llvm::LoadInst *OutputLoad = Builder.CreateLoad(AccumBuf); 1300 llvm::StoreInst *OutputStore = Builder.CreateStore(OutputLoad, OutBuf); 1301 if (gEnableRsTbaa) { 1302 OutputStore->setMetadata("tbaa", TBAAAllocation); 1303 } 1304 1305 return true; 1306 } 1307 1308 // Certain categories of functions that make up a general 1309 // reduce-style kernel are called directly from the driver with no 1310 // expansion needed. For a function in such a category, we need to 1311 // promote linkage from static to external, to ensure that the 1312 // function is visible to the driver in the dynamic symbol table. 1313 // This promotion is safe because we don't have any kind of cross 1314 // translation unit linkage model (except for linking against 1315 // RenderScript libraries), so we do not risk name clashes. 1316 bool PromoteReduceNewFunction(const char *Name, FunctionSet &PromotedFunctions) { 1317 if (!Name) // a presumably-optional function that is not present 1318 return false; 1319 1320 llvm::Function *Fn = Module->getFunction(Name); 1321 bccAssert(Fn != nullptr); 1322 if (PromotedFunctions.insert(Fn).second) { 1323 bccAssert(Fn->getLinkage() == llvm::GlobalValue::InternalLinkage); 1324 Fn->setLinkage(llvm::GlobalValue::ExternalLinkage); 1325 return true; 1326 } 1327 1328 return false; 1329 } 1330 1331 // Expand the accumulator function for a general reduce-style kernel. 1332 // 1333 // The input is a function of the form 1334 // 1335 // define void @func(accumType* %accum, foo1 in1[, ... fooN inN] [, special arguments]) 1336 // 1337 // where all arguments except the first are the same as for a foreach kernel. 
1338 // 1339 // The input accumulator function gets expanded into a function of the form 1340 // 1341 // define void @func.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, accumType* %accum) 1342 // 1343 // which performs a serial accumulaion of elements [x1, x2) into *%accum. 1344 // 1345 // In pseudocode, @func.expand does: 1346 // 1347 // for (i = %x1; i < %x2; ++i) { 1348 // func(%accum, 1349 // *((foo1 *)p->inPtr[0] + i)[, ... *((fooN *)p->inPtr[N-1] + i) 1350 // [, p] [, i] [, p->current.y] [, p->current.z]); 1351 // } 1352 // 1353 // This is very similar to foreach kernel expansion with no output. 1354 bool ExpandReduceNewAccumulator(llvm::Function *FnAccumulator, uint32_t Signature, size_t NumInputs) { 1355 ALOGV("Expanding accumulator %s for general reduce kernel", 1356 FnAccumulator->getName().str().c_str()); 1357 1358 // Create TBAA meta-data. 1359 llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript, 1360 *TBAAAllocation, *TBAAPointer; 1361 llvm::MDBuilder MDHelper(*Context); 1362 TBAARenderScriptDistinct = 1363 MDHelper.createTBAARoot(kRenderScriptTBAARootName); 1364 TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName, 1365 TBAARenderScriptDistinct); 1366 TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation", 1367 TBAARenderScript); 1368 TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation, 1369 TBAAAllocation, 0); 1370 TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer", 1371 TBAARenderScript); 1372 TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0); 1373 1374 auto AccumulatorArgIter = FnAccumulator->arg_begin(); 1375 1376 // Create empty accumulator function. 1377 llvm::Function *FnExpandedAccumulator = 1378 createEmptyExpandedReduceNewAccumulator(FnAccumulator->getName(), 1379 (AccumulatorArgIter++)->getType()); 1380 1381 // Extract the expanded accumulator's parameters. 
It is 1382 // guaranteed by createEmptyExpandedReduceNewAccumulator that 1383 // there will be 4 parameters. 1384 bccAssert(FnExpandedAccumulator->arg_size() == kNumExpandedReduceNewAccumulatorParams); 1385 auto ExpandedAccumulatorArgIter = FnExpandedAccumulator->arg_begin(); 1386 llvm::Value *Arg_p = &*(ExpandedAccumulatorArgIter++); 1387 llvm::Value *Arg_x1 = &*(ExpandedAccumulatorArgIter++); 1388 llvm::Value *Arg_x2 = &*(ExpandedAccumulatorArgIter++); 1389 llvm::Value *Arg_accum = &*(ExpandedAccumulatorArgIter++); 1390 1391 // Construct the actual function body. 1392 llvm::IRBuilder<> Builder(FnExpandedAccumulator->getEntryBlock().begin()); 1393 1394 // Create the loop structure. 1395 llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock(); 1396 llvm::PHINode *IndVar; 1397 createLoop(Builder, Arg_x1, Arg_x2, &IndVar); 1398 1399 llvm::SmallVector<llvm::Value*, 8> CalleeArgs; 1400 const int CalleeArgsContextIdx = 1401 ExpandSpecialArguments(Signature, IndVar, Arg_p, Builder, CalleeArgs, 1402 [](){}, LoopHeader->getTerminator()); 1403 1404 llvm::SmallVector<llvm::Value*, 8> InBufPtrs; 1405 llvm::SmallVector<llvm::Value*, 8> InStructTempSlots; 1406 ExpandInputsLoopInvariant(Builder, LoopHeader, Arg_p, TBAAPointer, AccumulatorArgIter, NumInputs, 1407 InBufPtrs, InStructTempSlots); 1408 1409 // Populate the actual call to the original accumulator. 
1410 llvm::SmallVector<llvm::Value*, 8> RootArgs; 1411 RootArgs.push_back(Arg_accum); 1412 ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInputs, InBufPtrs, InStructTempSlots, 1413 IndVar, RootArgs); 1414 finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *FnAccumulator, Builder); 1415 Builder.CreateCall(FnAccumulator, RootArgs); 1416 1417 return true; 1418 } 1419 1420 /// @brief Checks if pointers to allocation internals are exposed 1421 /// 1422 /// This function verifies if through the parameters passed to the kernel 1423 /// or through calls to the runtime library the script gains access to 1424 /// pointers pointing to data within a RenderScript Allocation. 1425 /// If we know we control all loads from and stores to data within 1426 /// RenderScript allocations and if we know the run-time internal accesses 1427 /// are all annotated with RenderScript TBAA metadata, only then we 1428 /// can safely use TBAA to distinguish between generic and from-allocation 1429 /// pointers. 1430 bool allocPointersExposed(llvm::Module &Module) { 1431 // Old style kernel function can expose pointers to elements within 1432 // allocations. 1433 // TODO: Extend analysis to allow simple cases of old-style kernels. 1434 for (size_t i = 0; i < mExportForEachCount; ++i) { 1435 const char *Name = mExportForEachNameList[i]; 1436 uint32_t Signature = mExportForEachSignatureList[i]; 1437 if (Module.getFunction(Name) && 1438 !bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature)) { 1439 return true; 1440 } 1441 } 1442 1443 // Check for library functions that expose a pointer to an Allocation or 1444 // that are not yet annotated with RenderScript-specific tbaa information. 1445 static const std::vector<const char *> Funcs{ 1446 // rsGetElementAt(...) 
1447 "_Z14rsGetElementAt13rs_allocationj", 1448 "_Z14rsGetElementAt13rs_allocationjj", 1449 "_Z14rsGetElementAt13rs_allocationjjj", 1450 1451 // rsSetElementAt() 1452 "_Z14rsSetElementAt13rs_allocationPvj", 1453 "_Z14rsSetElementAt13rs_allocationPvjj", 1454 "_Z14rsSetElementAt13rs_allocationPvjjj", 1455 1456 // rsGetElementAtYuv_uchar_Y() 1457 "_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj", 1458 1459 // rsGetElementAtYuv_uchar_U() 1460 "_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj", 1461 1462 // rsGetElementAtYuv_uchar_V() 1463 "_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj", 1464 }; 1465 1466 for (auto FI : Funcs) { 1467 llvm::Function *Function = Module.getFunction(FI); 1468 1469 if (!Function) { 1470 ALOGE("Missing run-time function '%s'", FI); 1471 return true; 1472 } 1473 1474 if (Function->getNumUses() > 0) { 1475 return true; 1476 } 1477 } 1478 1479 return false; 1480 } 1481 1482 /// @brief Connect RenderScript TBAA metadata to C/C++ metadata 1483 /// 1484 /// The TBAA metadata used to annotate loads/stores from RenderScript 1485 /// Allocations is generated in a separate TBAA tree with a 1486 /// "RenderScript Distinct TBAA" root node. LLVM does assume may-alias for 1487 /// all nodes in unrelated alias analysis trees. This function makes the 1488 /// "RenderScript TBAA" node (which is parented by the Distinct TBAA root), 1489 /// a subtree of the normal C/C++ TBAA tree aside of normal C/C++ types. With 1490 /// the connected trees every access to an Allocation is resolved to 1491 /// must-alias if compared to a normal C/C++ access. 
1492 void connectRenderScriptTBAAMetadata(llvm::Module &Module) { 1493 llvm::MDBuilder MDHelper(*Context); 1494 llvm::MDNode *TBAARenderScriptDistinct = 1495 MDHelper.createTBAARoot("RenderScript Distinct TBAA"); 1496 llvm::MDNode *TBAARenderScript = MDHelper.createTBAANode( 1497 "RenderScript TBAA", TBAARenderScriptDistinct); 1498 llvm::MDNode *TBAARoot = MDHelper.createTBAARoot("Simple C/C++ TBAA"); 1499 TBAARenderScript->replaceOperandWith(1, TBAARoot); 1500 } 1501 1502 virtual bool runOnModule(llvm::Module &Module) { 1503 bool Changed = false; 1504 this->Module = &Module; 1505 Context = &Module.getContext(); 1506 1507 buildTypes(); 1508 1509 bcinfo::MetadataExtractor me(&Module); 1510 if (!me.extract()) { 1511 ALOGE("Could not extract metadata from module!"); 1512 return false; 1513 } 1514 1515 // Expand forEach_* style kernels. 1516 mExportForEachCount = me.getExportForEachSignatureCount(); 1517 mExportForEachNameList = me.getExportForEachNameList(); 1518 mExportForEachSignatureList = me.getExportForEachSignatureList(); 1519 1520 for (size_t i = 0; i < mExportForEachCount; ++i) { 1521 const char *name = mExportForEachNameList[i]; 1522 uint32_t signature = mExportForEachSignatureList[i]; 1523 llvm::Function *kernel = Module.getFunction(name); 1524 if (kernel) { 1525 if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) { 1526 Changed |= ExpandForEach(kernel, signature); 1527 kernel->setLinkage(llvm::GlobalValue::InternalLinkage); 1528 } else if (kernel->getReturnType()->isVoidTy()) { 1529 Changed |= ExpandOldStyleForEach(kernel, signature); 1530 kernel->setLinkage(llvm::GlobalValue::InternalLinkage); 1531 } else { 1532 // There are some graphics root functions that are not 1533 // expanded, but that will be called directly. For those 1534 // functions, we can not set the linkage to internal. 1535 } 1536 } 1537 } 1538 1539 // Expand simple reduce_* style kernels. 
1540 mExportReduceCount = me.getExportReduceCount(); 1541 mExportReduceNameList = me.getExportReduceNameList(); 1542 1543 for (size_t i = 0; i < mExportReduceCount; ++i) { 1544 llvm::Function *kernel = Module.getFunction(mExportReduceNameList[i]); 1545 if (kernel) { 1546 Changed |= ExpandReduce(kernel); 1547 } 1548 } 1549 1550 // Process general reduce_* style functions. 1551 const size_t ExportReduceNewCount = me.getExportReduceNewCount(); 1552 const bcinfo::MetadataExtractor::ReduceNew *ExportReduceNewList = me.getExportReduceNewList(); 1553 // Note that functions can be shared between kernels 1554 FunctionSet PromotedFunctions, ExpandedAccumulators; 1555 1556 for (size_t i = 0; i < ExportReduceNewCount; ++i) { 1557 Changed |= PromoteReduceNewFunction(ExportReduceNewList[i].mInitializerName, PromotedFunctions); 1558 Changed |= PromoteReduceNewFunction(ExportReduceNewList[i].mOutConverterName, PromotedFunctions); 1559 1560 // Accumulator 1561 llvm::Function *accumulator = Module.getFunction(ExportReduceNewList[i].mAccumulatorName); 1562 bccAssert(accumulator != nullptr); 1563 if (ExpandedAccumulators.insert(accumulator).second) 1564 Changed |= ExpandReduceNewAccumulator(accumulator, 1565 ExportReduceNewList[i].mSignature, 1566 ExportReduceNewList[i].mInputCount); 1567 } 1568 1569 if (gEnableRsTbaa && !allocPointersExposed(Module)) { 1570 connectRenderScriptTBAAMetadata(Module); 1571 } 1572 1573 return Changed; 1574 } 1575 1576 virtual const char *getPassName() const { 1577 return "forEach_* and reduce_* function expansion"; 1578 } 1579 1580}; // end RSKernelExpandPass 1581 1582} // end anonymous namespace 1583 1584char RSKernelExpandPass::ID = 0; 1585static llvm::RegisterPass<RSKernelExpandPass> X("kernelexp", "Kernel Expand Pass"); 1586 1587namespace bcc { 1588 1589llvm::ModulePass * 1590createRSKernelExpandPass(bool pEnableStepOpt) { 1591 return new RSKernelExpandPass(pEnableStepOpt); 1592} 1593 1594} // end namespace bcc 1595