RSKernelExpand.cpp revision ba1a8f1e6f3eb5b7069e9ba1575f16e393c84c23
1/* 2 * Copyright 2012, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "bcc/Assert.h" 18#include "bcc/Renderscript/RSTransforms.h" 19 20#include <cstdlib> 21#include <functional> 22#include <unordered_set> 23 24#include <llvm/IR/DerivedTypes.h> 25#include <llvm/IR/Function.h> 26#include <llvm/IR/Instructions.h> 27#include <llvm/IR/IRBuilder.h> 28#include <llvm/IR/MDBuilder.h> 29#include <llvm/IR/Module.h> 30#include <llvm/Pass.h> 31#include <llvm/Support/raw_ostream.h> 32#include <llvm/IR/DataLayout.h> 33#include <llvm/IR/Function.h> 34#include <llvm/IR/Type.h> 35#include <llvm/Transforms/Utils/BasicBlockUtils.h> 36 37#include "bcc/Config/Config.h" 38#include "bcc/Support/Log.h" 39 40#include "bcinfo/MetadataExtractor.h" 41 42#ifndef __DISABLE_ASSERTS 43// Only used in bccAssert() 44const int kNumExpandedForeachParams = 4; 45const int kNumExpandedReduceParams = 3; 46const int kNumExpandedReduceNewAccumulatorParams = 4; 47#endif 48 49const char kRenderScriptTBAARootName[] = "RenderScript Distinct TBAA"; 50const char kRenderScriptTBAANodeName[] = "RenderScript TBAA"; 51 52using namespace bcc; 53 54namespace { 55 56static const bool gEnableRsTbaa = true; 57 58/* RSKernelExpandPass - This pass operates on functions that are able 59 * to be called via rsForEach(), "foreach_<NAME>", or 60 * "reduce_<NAME>". 
We create an inner loop for the function to be 61 * invoked over the appropriate data cells of the input/output 62 * allocations (adjusting other relevant parameters as we go). We 63 * support doing this for any forEach or reduce style compute 64 * kernels. The new function name is the original function name 65 * followed by ".expand". Note that we still generate code for the 66 * original function. 67 */ 68class RSKernelExpandPass : public llvm::ModulePass { 69public: 70 static char ID; 71 72private: 73 static const size_t RS_KERNEL_INPUT_LIMIT = 8; // see frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h 74 75 typedef std::unordered_set<llvm::Function *> FunctionSet; 76 77 enum RsLaunchDimensionsField { 78 RsLaunchDimensionsFieldX, 79 RsLaunchDimensionsFieldY, 80 RsLaunchDimensionsFieldZ, 81 RsLaunchDimensionsFieldLod, 82 RsLaunchDimensionsFieldFace, 83 RsLaunchDimensionsFieldArray, 84 85 RsLaunchDimensionsFieldCount 86 }; 87 88 enum RsExpandKernelDriverInfoPfxField { 89 RsExpandKernelDriverInfoPfxFieldInPtr, 90 RsExpandKernelDriverInfoPfxFieldInStride, 91 RsExpandKernelDriverInfoPfxFieldInLen, 92 RsExpandKernelDriverInfoPfxFieldOutPtr, 93 RsExpandKernelDriverInfoPfxFieldOutStride, 94 RsExpandKernelDriverInfoPfxFieldOutLen, 95 RsExpandKernelDriverInfoPfxFieldDim, 96 RsExpandKernelDriverInfoPfxFieldCurrent, 97 RsExpandKernelDriverInfoPfxFieldUsr, 98 RsExpandKernelDriverInfoPfxFieldUsLenr, 99 100 RsExpandKernelDriverInfoPfxFieldCount 101 }; 102 103 llvm::Module *Module; 104 llvm::LLVMContext *Context; 105 106 /* 107 * Pointers to LLVM type information for the the function signatures 108 * for expanded functions. These must be re-calculated for each module 109 * the pass is run on. 
110 */ 111 llvm::FunctionType *ExpandedForEachType, *ExpandedReduceType; 112 llvm::Type *RsExpandKernelDriverInfoPfxTy; 113 114 uint32_t mExportForEachCount; 115 const char **mExportForEachNameList; 116 const uint32_t *mExportForEachSignatureList; 117 118 uint32_t mExportReduceCount; 119 const char **mExportReduceNameList; 120 121 // Turns on optimization of allocation stride values. 122 bool mEnableStepOpt; 123 124 uint32_t getRootSignature(llvm::Function *Function) { 125 const llvm::NamedMDNode *ExportForEachMetadata = 126 Module->getNamedMetadata("#rs_export_foreach"); 127 128 if (!ExportForEachMetadata) { 129 llvm::SmallVector<llvm::Type*, 8> RootArgTys; 130 for (llvm::Function::arg_iterator B = Function->arg_begin(), 131 E = Function->arg_end(); 132 B != E; 133 ++B) { 134 RootArgTys.push_back(B->getType()); 135 } 136 137 // For pre-ICS bitcode, we may not have signature information. In that 138 // case, we use the size of the RootArgTys to select the number of 139 // arguments. 140 return (1 << RootArgTys.size()) - 1; 141 } 142 143 if (ExportForEachMetadata->getNumOperands() == 0) { 144 return 0; 145 } 146 147 bccAssert(ExportForEachMetadata->getNumOperands() > 0); 148 149 // We only handle the case for legacy root() functions here, so this is 150 // hard-coded to look at only the first such function. 
151 llvm::MDNode *SigNode = ExportForEachMetadata->getOperand(0); 152 if (SigNode != nullptr && SigNode->getNumOperands() == 1) { 153 llvm::Metadata *SigMD = SigNode->getOperand(0); 154 if (llvm::MDString *SigS = llvm::dyn_cast<llvm::MDString>(SigMD)) { 155 llvm::StringRef SigString = SigS->getString(); 156 uint32_t Signature = 0; 157 if (SigString.getAsInteger(10, Signature)) { 158 ALOGE("Non-integer signature value '%s'", SigString.str().c_str()); 159 return 0; 160 } 161 return Signature; 162 } 163 } 164 165 return 0; 166 } 167 168 bool isStepOptSupported(llvm::Type *AllocType) { 169 170 llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType); 171 llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context); 172 173 if (mEnableStepOpt) { 174 return false; 175 } 176 177 if (AllocType == VoidPtrTy) { 178 return false; 179 } 180 181 if (!PT) { 182 return false; 183 } 184 185 // remaining conditions are 64-bit only 186 if (VoidPtrTy->getPrimitiveSizeInBits() == 32) { 187 return true; 188 } 189 190 // coerce suggests an upconverted struct type, which we can't support 191 if (AllocType->getStructName().find("coerce") != llvm::StringRef::npos) { 192 return false; 193 } 194 195 // 2xi64 and i128 suggest an upconverted struct type, which are also unsupported 196 llvm::Type *V2xi64Ty = llvm::VectorType::get(llvm::Type::getInt64Ty(*Context), 2); 197 llvm::Type *Int128Ty = llvm::Type::getIntNTy(*Context, 128); 198 if (AllocType == V2xi64Ty || AllocType == Int128Ty) { 199 return false; 200 } 201 202 return true; 203 } 204 205 // Get the actual value we should use to step through an allocation. 206 // 207 // Normally the value we use to step through an allocation is given to us by 208 // the driver. However, for certain primitive data types, we can derive an 209 // integer constant for the step value. We use this integer constant whenever 210 // possible to allow further compiler optimizations to take place. 211 // 212 // DL - Target Data size/layout information. 
213 // T - Type of allocation (should be a pointer). 214 // OrigStep - Original step increment (root.expand() input from driver). 215 llvm::Value *getStepValue(llvm::DataLayout *DL, llvm::Type *AllocType, 216 llvm::Value *OrigStep) { 217 bccAssert(DL); 218 bccAssert(AllocType); 219 bccAssert(OrigStep); 220 llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType); 221 if (isStepOptSupported(AllocType)) { 222 llvm::Type *ET = PT->getElementType(); 223 uint64_t ETSize = DL->getTypeAllocSize(ET); 224 llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context); 225 return llvm::ConstantInt::get(Int32Ty, ETSize); 226 } else { 227 return OrigStep; 228 } 229 } 230 231 /// Builds the types required by the pass for the given context. 232 void buildTypes(void) { 233 // Create the RsLaunchDimensionsTy and RsExpandKernelDriverInfoPfxTy structs. 234 235 llvm::Type *Int8Ty = llvm::Type::getInt8Ty(*Context); 236 llvm::Type *Int8PtrTy = Int8Ty->getPointerTo(); 237 llvm::Type *Int8PtrArrayInputLimitTy = llvm::ArrayType::get(Int8PtrTy, RS_KERNEL_INPUT_LIMIT); 238 llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context); 239 llvm::Type *Int32ArrayInputLimitTy = llvm::ArrayType::get(Int32Ty, RS_KERNEL_INPUT_LIMIT); 240 llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context); 241 llvm::Type *Int32Array4Ty = llvm::ArrayType::get(Int32Ty, 4); 242 243 /* Defined in frameworks/base/libs/rs/cpu_ref/rsCpuCore.h: 244 * 245 * struct RsLaunchDimensions { 246 * uint32_t x; 247 * uint32_t y; 248 * uint32_t z; 249 * uint32_t lod; 250 * uint32_t face; 251 * uint32_t array[4]; 252 * }; 253 */ 254 llvm::SmallVector<llvm::Type*, RsLaunchDimensionsFieldCount> RsLaunchDimensionsTypes; 255 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t x 256 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t y 257 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t z 258 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t lod 259 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t face 
260 RsLaunchDimensionsTypes.push_back(Int32Array4Ty); // uint32_t array[4] 261 llvm::StructType *RsLaunchDimensionsTy = 262 llvm::StructType::create(RsLaunchDimensionsTypes, "RsLaunchDimensions"); 263 264 /* Defined as the beginning of RsExpandKernelDriverInfo in frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h: 265 * 266 * struct RsExpandKernelDriverInfoPfx { 267 * const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT]; 268 * uint32_t inStride[RS_KERNEL_INPUT_LIMIT]; 269 * uint32_t inLen; 270 * 271 * uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT]; 272 * uint32_t outStride[RS_KERNEL_INPUT_LIMIT]; 273 * uint32_t outLen; 274 * 275 * // Dimension of the launch 276 * RsLaunchDimensions dim; 277 * 278 * // The walking iterator of the launch 279 * RsLaunchDimensions current; 280 * 281 * const void *usr; 282 * uint32_t usrLen; 283 * 284 * // Items below this line are not used by the compiler and can be change in the driver. 285 * // So the compiler must assume there are an unknown number of fields of unknown type 286 * // beginning here. 287 * }; 288 * 289 * The name "RsExpandKernelDriverInfoPfx" is known to RSInvariantPass (RSInvariant.cpp). 
290 */ 291 llvm::SmallVector<llvm::Type*, RsExpandKernelDriverInfoPfxFieldCount> RsExpandKernelDriverInfoPfxTypes; 292 RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT] 293 RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy); // uint32_t inStride[RS_KERNEL_INPUT_LIMIT] 294 RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty); // uint32_t inLen 295 RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT] 296 RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy); // uint32_t outStride[RS_KERNEL_INPUT_LIMIT] 297 RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty); // uint32_t outLen 298 RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy); // RsLaunchDimensions dim 299 RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy); // RsLaunchDimensions current 300 RsExpandKernelDriverInfoPfxTypes.push_back(VoidPtrTy); // const void *usr 301 RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty); // uint32_t usrLen 302 RsExpandKernelDriverInfoPfxTy = 303 llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx"); 304 305 // Create the function type for expanded kernels. 306 llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context); 307 308 llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo(); 309 // void (const RsExpandKernelDriverInfoPfxTy *p, uint32_t x1, uint32_t x2, uint32_t outstep) 310 ExpandedForEachType = llvm::FunctionType::get(VoidTy, 311 {RsExpandKernelDriverInfoPfxPtrTy, Int32Ty, Int32Ty, Int32Ty}, false); 312 313 // void (void *inBuf, void *outBuf, uint32_t len) 314 ExpandedReduceType = llvm::FunctionType::get(VoidTy, {VoidPtrTy, VoidPtrTy, Int32Ty}, false); 315 } 316 317 /// @brief Create skeleton of the expanded foreach kernel. 
318 /// 319 /// This creates a function with the following signature: 320 /// 321 /// void (const RsForEachStubParamStruct *p, uint32_t x1, uint32_t x2, 322 /// uint32_t outstep) 323 /// 324 llvm::Function *createEmptyExpandedForEachKernel(llvm::StringRef OldName) { 325 llvm::Function *ExpandedFunction = 326 llvm::Function::Create(ExpandedForEachType, 327 llvm::GlobalValue::ExternalLinkage, 328 OldName + ".expand", Module); 329 bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams); 330 llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin(); 331 (AI++)->setName("p"); 332 (AI++)->setName("x1"); 333 (AI++)->setName("x2"); 334 (AI++)->setName("arg_outstep"); 335 llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin", 336 ExpandedFunction); 337 llvm::IRBuilder<> Builder(Begin); 338 Builder.CreateRetVoid(); 339 return ExpandedFunction; 340 } 341 342 // Create skeleton of the expanded reduce kernel. 343 // 344 // This creates a function with the following signature: 345 // 346 // void @func.expand(i8* nocapture %inBuf, i8* nocapture %outBuf, i32 len) 347 // 348 llvm::Function *createEmptyExpandedReduceKernel(llvm::StringRef OldName) { 349 llvm::Function *ExpandedFunction = 350 llvm::Function::Create(ExpandedReduceType, 351 llvm::GlobalValue::ExternalLinkage, 352 OldName + ".expand", Module); 353 bccAssert(ExpandedFunction->arg_size() == kNumExpandedReduceParams); 354 355 llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin(); 356 357 using llvm::Attribute; 358 359 llvm::Argument *InBuf = &(*AI++); 360 InBuf->setName("inBuf"); 361 InBuf->addAttr(llvm::AttributeSet::get(*Context, InBuf->getArgNo() + 1, llvm::makeArrayRef(Attribute::NoCapture))); 362 363 llvm::Argument *OutBuf = &(*AI++); 364 OutBuf->setName("outBuf"); 365 OutBuf->addAttr(llvm::AttributeSet::get(*Context, OutBuf->getArgNo() + 1, llvm::makeArrayRef(Attribute::NoCapture))); 366 367 (AI++)->setName("len"); 368 369 llvm::BasicBlock *Begin = 
llvm::BasicBlock::Create(*Context, "Begin", 370 ExpandedFunction); 371 llvm::IRBuilder<> Builder(Begin); 372 Builder.CreateRetVoid(); 373 374 return ExpandedFunction; 375 } 376 377 // Create skeleton of a general reduce kernel's expanded accumulator. 378 // 379 // This creates a function with the following signature: 380 // 381 // void @func.expand(%RsExpandKernelDriverInfoPfx* nocapture %p, 382 // i32 %x1, i32 %x2, accumType* nocapture %accum) 383 // 384 llvm::Function *createEmptyExpandedReduceNewAccumulator(llvm::StringRef OldName, 385 llvm::Type *AccumArgTy) { 386 llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context); 387 llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context); 388 llvm::FunctionType *ExpandedReduceNewAccumulatorType = 389 llvm::FunctionType::get(VoidTy, 390 {RsExpandKernelDriverInfoPfxTy->getPointerTo(), 391 Int32Ty, Int32Ty, AccumArgTy}, false); 392 llvm::Function *FnExpandedAccumulator = 393 llvm::Function::Create(ExpandedReduceNewAccumulatorType, 394 llvm::GlobalValue::ExternalLinkage, 395 OldName + ".expand", Module); 396 bccAssert(FnExpandedAccumulator->arg_size() == kNumExpandedReduceNewAccumulatorParams); 397 398 llvm::Function::arg_iterator AI = FnExpandedAccumulator->arg_begin(); 399 400 using llvm::Attribute; 401 402 llvm::Argument *Arg_p = &(*AI++); 403 Arg_p->setName("p"); 404 Arg_p->addAttr(llvm::AttributeSet::get(*Context, Arg_p->getArgNo() + 1, 405 llvm::makeArrayRef(Attribute::NoCapture))); 406 407 llvm::Argument *Arg_x1 = &(*AI++); 408 Arg_x1->setName("x1"); 409 410 llvm::Argument *Arg_x2 = &(*AI++); 411 Arg_x2->setName("x2"); 412 413 llvm::Argument *Arg_accum = &(*AI++); 414 Arg_accum->setName("accum"); 415 Arg_accum->addAttr(llvm::AttributeSet::get(*Context, Arg_accum->getArgNo() + 1, 416 llvm::makeArrayRef(Attribute::NoCapture))); 417 418 llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin", 419 FnExpandedAccumulator); 420 llvm::IRBuilder<> Builder(Begin); 421 Builder.CreateRetVoid(); 422 423 return 
FnExpandedAccumulator; 424 } 425 426 /// @brief Create an empty loop 427 /// 428 /// Create a loop of the form: 429 /// 430 /// for (i = LowerBound; i < UpperBound; i++) 431 /// ; 432 /// 433 /// After the loop has been created, the builder is set such that 434 /// instructions can be added to the loop body. 435 /// 436 /// @param Builder The builder to use to build this loop. The current 437 /// position of the builder is the position the loop 438 /// will be inserted. 439 /// @param LowerBound The first value of the loop iterator 440 /// @param UpperBound The maximal value of the loop iterator 441 /// @param LoopIV A reference that will be set to the loop iterator. 442 /// @return The BasicBlock that will be executed after the loop. 443 llvm::BasicBlock *createLoop(llvm::IRBuilder<> &Builder, 444 llvm::Value *LowerBound, 445 llvm::Value *UpperBound, 446 llvm::Value **LoopIV) { 447 bccAssert(LowerBound->getType() == UpperBound->getType()); 448 449 llvm::BasicBlock *CondBB, *AfterBB, *HeaderBB; 450 llvm::Value *Cond, *IVNext, *IV, *IVVar; 451 452 CondBB = Builder.GetInsertBlock(); 453 AfterBB = llvm::SplitBlock(CondBB, Builder.GetInsertPoint(), nullptr, nullptr); 454 HeaderBB = llvm::BasicBlock::Create(*Context, "Loop", CondBB->getParent()); 455 456 CondBB->getTerminator()->eraseFromParent(); 457 Builder.SetInsertPoint(CondBB); 458 459 // decltype(LowerBound) *ivvar = alloca(sizeof(int)) 460 // *ivvar = LowerBound 461 IVVar = Builder.CreateAlloca(LowerBound->getType(), nullptr, BCC_INDEX_VAR_NAME); 462 Builder.CreateStore(LowerBound, IVVar); 463 464 // if (LowerBound < Upperbound) 465 // goto LoopHeader 466 // else 467 // goto AfterBB 468 Cond = Builder.CreateICmpULT(LowerBound, UpperBound); 469 Builder.CreateCondBr(Cond, HeaderBB, AfterBB); 470 471 // LoopHeader: 472 // iv = *ivvar 473 // <insertion point here> 474 // iv.next = iv + 1 475 // *ivvar = iv.next 476 // if (iv.next < Upperbound) 477 // goto LoopHeader 478 // else 479 // goto AfterBB 480 // AfterBB: 481 
Builder.SetInsertPoint(HeaderBB); 482 IV = Builder.CreateLoad(IVVar, "X"); 483 IVNext = Builder.CreateNUWAdd(IV, Builder.getInt32(1)); 484 Builder.CreateStore(IVNext, IVVar); 485 Cond = Builder.CreateICmpULT(IVNext, UpperBound); 486 Builder.CreateCondBr(Cond, HeaderBB, AfterBB); 487 AfterBB->setName("Exit"); 488 Builder.SetInsertPoint(llvm::cast<llvm::Instruction>(IVNext)); 489 490 // Record information about this loop. 491 *LoopIV = IV; 492 return AfterBB; 493 } 494 495 // Finish building the outgoing argument list for calling a ForEach-able function. 496 // 497 // ArgVector - on input, the non-special arguments 498 // on output, the non-special arguments combined with the special arguments 499 // from SpecialArgVector 500 // SpecialArgVector - special arguments (from ExpandSpecialArguments()) 501 // SpecialArgContextIdx - return value of ExpandSpecialArguments() 502 // (position of context argument in SpecialArgVector) 503 // CalleeFunction - the ForEach-able function being called 504 // Builder - for inserting code into the caller function 505 template<unsigned int ArgVectorLen, unsigned int SpecialArgVectorLen> 506 void finishArgList( llvm::SmallVector<llvm::Value *, ArgVectorLen> &ArgVector, 507 const llvm::SmallVector<llvm::Value *, SpecialArgVectorLen> &SpecialArgVector, 508 const int SpecialArgContextIdx, 509 const llvm::Function &CalleeFunction, 510 llvm::IRBuilder<> &CallerBuilder) { 511 /* The context argument (if any) is a pointer to an opaque user-visible type that differs from 512 * the RsExpandKernelDriverInfoPfx type used in the function we are generating (although the 513 * two types represent the same thing). Therefore, we must introduce a pointer cast when 514 * generating a call to the kernel function. 515 */ 516 const int ArgContextIdx = 517 SpecialArgContextIdx >= 0 ? 
(ArgVector.size() + SpecialArgContextIdx) : SpecialArgContextIdx; 518 ArgVector.append(SpecialArgVector.begin(), SpecialArgVector.end()); 519 if (ArgContextIdx >= 0) { 520 llvm::Type *ContextArgType = nullptr; 521 int ArgIdx = ArgContextIdx; 522 for (const auto &Arg : CalleeFunction.getArgumentList()) { 523 if (!ArgIdx--) { 524 ContextArgType = Arg.getType(); 525 break; 526 } 527 } 528 bccAssert(ContextArgType); 529 ArgVector[ArgContextIdx] = CallerBuilder.CreatePointerCast(ArgVector[ArgContextIdx], ContextArgType); 530 } 531 } 532 533 // GEPHelper() returns a SmallVector of values suitable for passing 534 // to IRBuilder::CreateGEP(), and SmallGEPIndices is a typedef for 535 // the returned data type. It is sized so that the SmallVector 536 // returned by GEPHelper() never needs to do a heap allocation for 537 // any list of GEP indices it encounters in the code. 538 typedef llvm::SmallVector<llvm::Value *, 3> SmallGEPIndices; 539 540 // Helper for turning a list of constant integer GEP indices into a 541 // SmallVector of llvm::Value*. The return value is suitable for 542 // passing to a GetElementPtrInst constructor or IRBuilder::CreateGEP(). 543 // 544 // Inputs: 545 // I32Args should be integers which represent the index arguments 546 // to a GEP instruction. 547 // 548 // Returns: 549 // Returns a SmallVector of ConstantInts. 
550 SmallGEPIndices GEPHelper(const std::initializer_list<int32_t> I32Args) { 551 SmallGEPIndices Out(I32Args.size()); 552 llvm::IntegerType *I32Ty = llvm::Type::getInt32Ty(*Context); 553 std::transform(I32Args.begin(), I32Args.end(), Out.begin(), 554 [I32Ty](int32_t Arg) { return llvm::ConstantInt::get(I32Ty, Arg); }); 555 return Out; 556 } 557 558public: 559 RSKernelExpandPass(bool pEnableStepOpt = true) 560 : ModulePass(ID), Module(nullptr), Context(nullptr), 561 mEnableStepOpt(pEnableStepOpt) { 562 563 } 564 565 virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override { 566 // This pass does not use any other analysis passes, but it does 567 // add/wrap the existing functions in the module (thus altering the CFG). 568 } 569 570 // Build contribution to outgoing argument list for calling a 571 // ForEach-able function or a general reduction accumulator 572 // function, based on the special parameters of that function. 573 // 574 // Signature - metadata bits for the signature of the callee 575 // X, Arg_p - values derived directly from expanded function, 576 // suitable for computing arguments for the callee 577 // CalleeArgs - contribution is accumulated here 578 // Bump - invoked once for each contributed outgoing argument 579 // LoopHeaderInsertionPoint - an Instruction in the loop header, before which 580 // this function can insert loop-invariant loads 581 // 582 // Return value is the (zero-based) position of the context (Arg_p) 583 // argument in the CalleeArgs vector, or a negative value if the 584 // context argument is not placed in the CalleeArgs vector. 
585 int ExpandSpecialArguments(uint32_t Signature, 586 llvm::Value *X, 587 llvm::Value *Arg_p, 588 llvm::IRBuilder<> &Builder, 589 llvm::SmallVector<llvm::Value*, 8> &CalleeArgs, 590 std::function<void ()> Bump, 591 llvm::Instruction *LoopHeaderInsertionPoint) { 592 593 bccAssert(CalleeArgs.empty()); 594 595 int Return = -1; 596 if (bcinfo::MetadataExtractor::hasForEachSignatureCtxt(Signature)) { 597 CalleeArgs.push_back(Arg_p); 598 Bump(); 599 Return = CalleeArgs.size() - 1; 600 } 601 602 if (bcinfo::MetadataExtractor::hasForEachSignatureX(Signature)) { 603 CalleeArgs.push_back(X); 604 Bump(); 605 } 606 607 if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature) || 608 bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) { 609 bccAssert(LoopHeaderInsertionPoint); 610 611 // Y and Z are loop invariant, so they can be hoisted out of the 612 // loop. Set the IRBuilder insertion point to the loop header. 613 auto OldInsertionPoint = Builder.saveIP(); 614 Builder.SetInsertPoint(LoopHeaderInsertionPoint); 615 616 if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature)) { 617 SmallGEPIndices YValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent, 618 RsLaunchDimensionsFieldY})); 619 llvm::Value *YAddr = Builder.CreateInBoundsGEP(Arg_p, YValueGEP, "Y.gep"); 620 CalleeArgs.push_back(Builder.CreateLoad(YAddr, "Y")); 621 Bump(); 622 } 623 624 if (bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) { 625 SmallGEPIndices ZValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent, 626 RsLaunchDimensionsFieldZ})); 627 llvm::Value *ZAddr = Builder.CreateInBoundsGEP(Arg_p, ZValueGEP, "Z.gep"); 628 CalleeArgs.push_back(Builder.CreateLoad(ZAddr, "Z")); 629 Bump(); 630 } 631 632 Builder.restoreIP(OldInsertionPoint); 633 } 634 635 return Return; 636 } 637 638 // Generate loop-invariant input processing setup code for an expanded 639 // ForEach-able function or an expanded general reduction accumulator 640 // function. 
641 // 642 // LoopHeader - block at the end of which the setup code will be inserted 643 // Arg_p - RSKernelDriverInfo pointer passed to the expanded function 644 // TBAAPointer - metadata for marking loads of pointer values out of RSKernelDriverInfo 645 // ArgIter - iterator pointing to first input of the UNexpanded function 646 // NumInputs - number of inputs (NOT number of ARGUMENTS) 647 // 648 // InBufPtrs[] - this function sets each array element to point to the first 649 // cell of the corresponding input allocation 650 // InStructTempSlots[] - this function sets each array element either to nullptr 651 // or to the result of an alloca (for the case where the 652 // calling convention dictates that a value must be passed 653 // by reference, and so we need a stacked temporary to hold 654 // a copy of that value) 655 void ExpandInputsLoopInvariant(llvm::IRBuilder<> &Builder, llvm::BasicBlock *LoopHeader, 656 llvm::Value *Arg_p, 657 llvm::MDNode *TBAAPointer, 658 llvm::Function::arg_iterator ArgIter, 659 const size_t NumInputs, 660 llvm::SmallVectorImpl<llvm::Value *> &InBufPtrs, 661 llvm::SmallVectorImpl<llvm::Value *> &InStructTempSlots) { 662 bccAssert(NumInputs <= RS_KERNEL_INPUT_LIMIT); 663 664 // Extract information about input slots. The work done 665 // here is loop-invariant, so we can hoist the operations out of the loop. 666 auto OldInsertionPoint = Builder.saveIP(); 667 Builder.SetInsertPoint(LoopHeader->getTerminator()); 668 669 for (size_t InputIndex = 0; InputIndex < NumInputs; ++InputIndex, ArgIter++) { 670 llvm::Type *InType = ArgIter->getType(); 671 672 /* 673 * AArch64 calling conventions dictate that structs of sufficient size 674 * get passed by pointer instead of passed by value. This, combined 675 * with the fact that we don't allow kernels to operate on pointer 676 * data means that if we see a kernel with a pointer parameter we know 677 * that it is a struct input that has been promoted. 
As such we don't 678 * need to convert its type to a pointer. Later we will need to know 679 * to create a temporary copy on the stack, so we save this information 680 * in InStructTempSlots. 681 */ 682 if (auto PtrType = llvm::dyn_cast<llvm::PointerType>(InType)) { 683 llvm::Type *ElementType = PtrType->getElementType(); 684 InStructTempSlots.push_back(Builder.CreateAlloca(ElementType, nullptr, 685 "input_struct_slot")); 686 } else { 687 InType = InType->getPointerTo(); 688 InStructTempSlots.push_back(nullptr); 689 } 690 691 SmallGEPIndices InBufPtrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr, 692 static_cast<int32_t>(InputIndex)})); 693 llvm::Value *InBufPtrAddr = Builder.CreateInBoundsGEP(Arg_p, InBufPtrGEP, "input_buf.gep"); 694 llvm::LoadInst *InBufPtr = Builder.CreateLoad(InBufPtrAddr, "input_buf"); 695 llvm::Value *CastInBufPtr = Builder.CreatePointerCast(InBufPtr, InType, "casted_in"); 696 697 if (gEnableRsTbaa) { 698 InBufPtr->setMetadata("tbaa", TBAAPointer); 699 } 700 701 InBufPtrs.push_back(CastInBufPtr); 702 } 703 704 Builder.restoreIP(OldInsertionPoint); 705 } 706 707 // Generate loop-varying input processing code for an expanded ForEach-able function 708 // or an expanded general reduction accumulator function. Also, for the call to the 709 // UNexpanded function, collect the portion of the argument list corresponding to the 710 // inputs. 
711 // 712 // Arg_x1 - first X coordinate to be processed by the expanded function 713 // TBAAAllocation - metadata for marking loads of input values out of allocations 714 // NumInputs -- number of inputs (NOT number of ARGUMENTS) 715 // InBufPtrs[] - this function consumes the information produced by ExpandInputsLoopInvariant() 716 // InStructTempSlots[] - this function consumes the information produced by ExpandInputsLoopInvariant() 717 // IndVar - value of loop induction variable (X coordinate) for a given loop iteration 718 // 719 // RootArgs - this function sets this to the list of outgoing argument values corresponding 720 // to the inputs 721 void ExpandInputsBody(llvm::IRBuilder<> &Builder, 722 llvm::Value *Arg_x1, 723 llvm::MDNode *TBAAAllocation, 724 const size_t NumInputs, 725 const llvm::SmallVectorImpl<llvm::Value *> &InBufPtrs, 726 const llvm::SmallVectorImpl<llvm::Value *> &InStructTempSlots, 727 llvm::Value *IndVar, 728 llvm::SmallVectorImpl<llvm::Value *> &RootArgs) { 729 llvm::Value *Offset = Builder.CreateSub(IndVar, Arg_x1); 730 731 for (size_t Index = 0; Index < NumInputs; ++Index) { 732 llvm::Value *InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], Offset); 733 llvm::Value *Input; 734 735 llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input"); 736 737 if (gEnableRsTbaa) { 738 InputLoad->setMetadata("tbaa", TBAAAllocation); 739 } 740 741 if (llvm::Value *TemporarySlot = InStructTempSlots[Index]) { 742 // Pass a pointer to a temporary on the stack, rather than 743 // passing a pointer to the original value. We do not want 744 // the kernel to potentially modify the input data. 745 746 // Note: don't annotate with TBAA, since the kernel might 747 // have its own TBAA annotations for the pointer argument. 748 Builder.CreateStore(InputLoad, TemporarySlot); 749 Input = TemporarySlot; 750 } else { 751 Input = InputLoad; 752 } 753 754 RootArgs.push_back(Input); 755 } 756 } 757 758 /* Performs the actual optimization on a selected function. 
On success, the
 * Module will contain a new function of the name "<NAME>.expand" that
 * invokes <NAME>() in a loop with the appropriate parameters.
 *
 * If Signature is 0 it is derived from the function itself via
 * getRootSignature(). Returns false (without creating the expanded
 * function) when no signature can be determined, and true otherwise.
 */
bool ExpandOldStyleForEach(llvm::Function *Function, uint32_t Signature) {
  ALOGV("Expanding ForEach-able Function %s",
        Function->getName().str().c_str());

  if (!Signature) {
    Signature = getRootSignature(Function);
    if (!Signature) {
      // We couldn't determine how to expand this function based on its
      // function signature.
      return false;
    }
  }

  llvm::DataLayout DL(Module);

  llvm::Function *ExpandedFunction =
    createEmptyExpandedForEachKernel(Function->getName());

  /*
   * Extract the expanded function's parameters.  It is guaranteed by
   * createEmptyExpandedForEachKernel that there will be four parameters.
   */

  bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);

  llvm::Function::arg_iterator ExpandedFunctionArgIter =
    ExpandedFunction->arg_begin();

  // Parameter order of the expanded function: driver info pointer, loop
  // start, loop end, output step.
  llvm::Value *Arg_p       = &*(ExpandedFunctionArgIter++);
  llvm::Value *Arg_x1      = &*(ExpandedFunctionArgIter++);
  llvm::Value *Arg_x2      = &*(ExpandedFunctionArgIter++);
  llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);

  llvm::Value *InStep  = nullptr;
  llvm::Value *OutStep = nullptr;

  // Construct the actual function body.
  llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());

  // Collect and construct the arguments for the kernel().
  // Note that we load any loop-invariant arguments before entering the Loop.
  //
  // FunctionArgIter walks the original function's parameters; each signature
  // bit that is set (in, out, usrData, then the special arguments) consumes
  // one parameter, in that order.
  llvm::Function::arg_iterator FunctionArgIter = Function->arg_begin();

  llvm::Type  *InTy     = nullptr;
  llvm::Value *InBufPtr = nullptr;
  if (bcinfo::MetadataExtractor::hasForEachSignatureIn(Signature)) {
    // Load the stride of the first input (p->inStride[0]).
    SmallGEPIndices InStepGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInStride, 0}));
    llvm::LoadInst *InStepArg = Builder.CreateLoad(
      Builder.CreateInBoundsGEP(Arg_p, InStepGEP, "instep_addr.gep"), "instep_addr");

    InTy = (FunctionArgIter++)->getType();
    InStep = getStepValue(&DL, InTy, InStepArg);

    InStep->setName("instep");

    // Load the base pointer of the first input (p->inPtr[0]).
    SmallGEPIndices InputAddrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr, 0}));
    InBufPtr = Builder.CreateLoad(
      Builder.CreateInBoundsGEP(Arg_p, InputAddrGEP, "input_buf.gep"), "input_buf");
  }

  llvm::Type  *OutTy      = nullptr;
  llvm::Value *OutBasePtr = nullptr;
  if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
    OutTy = (FunctionArgIter++)->getType();
    // Unlike the input stride, the output step comes from the expanded
    // function's own fourth parameter.
    OutStep = getStepValue(&DL, OutTy, Arg_outstep);
    OutStep->setName("outstep");
    // Load the base pointer of the first output (p->outPtr[0]).
    SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
    OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
  }

  llvm::Value *UsrData = nullptr;
  if (bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)) {
    // Load p->usr and cast it to the type the kernel expects.
    llvm::Type *UsrDataTy = (FunctionArgIter++)->getType();
    llvm::Value *UsrDataPointerAddr = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldUsr);
    UsrData = Builder.CreatePointerCast(Builder.CreateLoad(UsrDataPointerAddr), UsrDataTy);
    UsrData->setName("UsrData");
  }

  // Remember the block we were emitting into before the loop is created; its
  // terminator is handed to ExpandSpecialArguments() below (presumably as the
  // insertion point for loop-invariant code -- confirm against that helper).
  llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
  llvm::Value *IV;
  createLoop(Builder, Arg_x1, Arg_x2, &IV);

  // The callback advances FunctionArgIter once per special argument so that
  // the assertion below can verify every kernel parameter was accounted for.
  llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
  const int CalleeArgsContextIdx = ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
                                                          [&FunctionArgIter]() { FunctionArgIter++; },
                                                          LoopHeader->getTerminator());

  bccAssert(FunctionArgIter == Function->arg_end());

  // Populate the actual call to kernel().
  llvm::SmallVector<llvm::Value*, 8> RootArgs;

  llvm::Value *InPtr  = nullptr;
  llvm::Value *OutPtr = nullptr;

  // Calculate the current input and output pointers
  //
  // We always calculate the input/output pointers with a GEP operating on i8
  // values and only cast at the very end to OutTy. This is because the step
  // between two values is given in bytes.
  //
  // TODO: We could further optimize the output by using a GEP operation of
  // type 'OutTy' in cases where the element type of the allocation allows.
  if (OutBasePtr) {
    // out = outBase + (IV - x1) * outstep
    llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
    OutOffset = Builder.CreateMul(OutOffset, OutStep);
    OutPtr = Builder.CreateInBoundsGEP(OutBasePtr, OutOffset);
    OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
  }

  if (InBufPtr) {
    // in = inBase + (IV - x1) * instep
    llvm::Value *InOffset = Builder.CreateSub(IV, Arg_x1);
    InOffset = Builder.CreateMul(InOffset, InStep);
    InPtr = Builder.CreateInBoundsGEP(InBufPtr, InOffset);
    InPtr = Builder.CreatePointerCast(InPtr, InTy);
  }

  // Argument order of the call: in, out, usrData, then the special arguments
  // appended by finishArgList().
  if (InPtr) {
    RootArgs.push_back(InPtr);
  }

  if (OutPtr) {
    RootArgs.push_back(OutPtr);
  }

  if (UsrData) {
    RootArgs.push_back(UsrData);
  }

  finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);

  Builder.CreateCall(Function, RootArgs);

  return true;
}

/* Expand a pass-by-value foreach kernel.
 *
 * Signature must have the "kernel" bit set (asserted). Inputs are passed to
 * the kernel by value; the output is either the kernel's return value, which
 * is stored into the output allocation here, or -- when the kernel returns
 * void but the signature has an output -- a pointer passed as the kernel's
 * first parameter. Always returns true.
 */
bool ExpandForEach(llvm::Function *Function, uint32_t Signature) {
  bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature));
  ALOGV("Expanding kernel Function %s", Function->getName().str().c_str());

  // TODO: Refactor this to share functionality with ExpandOldStyleForEach.
  llvm::DataLayout DL(Module);

  llvm::Function *ExpandedFunction =
    createEmptyExpandedForEachKernel(Function->getName());

  /*
   * Extract the expanded function's parameters.  It is guaranteed by
   * createEmptyExpandedForEachKernel that there will be four parameters.
   */

  bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);

  llvm::Function::arg_iterator ExpandedFunctionArgIter =
    ExpandedFunction->arg_begin();

  llvm::Value *Arg_p       = &*(ExpandedFunctionArgIter++);
  llvm::Value *Arg_x1      = &*(ExpandedFunctionArgIter++);
  llvm::Value *Arg_x2      = &*(ExpandedFunctionArgIter++);
  // Arg_outstep is not used by expanded new-style forEach kernels.

  // Construct the actual function body.
  llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());

  // Create TBAA meta-data.
  //
  // Two access tags are built under the "RenderScript TBAA" node:
  // TBAAAllocation (attached to accesses of allocation data) and TBAAPointer
  // (attached to loads of buffer pointers from the driver info structure).
  llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript,
               *TBAAAllocation, *TBAAPointer;
  llvm::MDBuilder MDHelper(*Context);

  TBAARenderScriptDistinct =
    MDHelper.createTBAARoot(kRenderScriptTBAARootName);
  TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
      TBAARenderScriptDistinct);
  TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
                                                     TBAARenderScript);
  TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
                                                    TBAAAllocation, 0);
  TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer",
                                                  TBAARenderScript);
  TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);

  /*
   * Collect and construct the arguments for the kernel().
   *
   * Note that we load any loop-invariant arguments before entering the Loop.
   */
  // Starts as the total parameter count and is decremented for every
  // parameter that turns out not to be an input (out pointer, special args).
  size_t NumRemainingInputs = Function->arg_size();

  // No usrData parameter on kernels.
  bccAssert(
      !bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature));

  llvm::Function::arg_iterator ArgIter = Function->arg_begin();

  // Check the return type
  llvm::Type     *OutTy            = nullptr;
  llvm::LoadInst *OutBasePtr       = nullptr;
  llvm::Value    *CastedOutBasePtr = nullptr;

  bool PassOutByPointer = false;

  if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
    llvm::Type *OutBaseTy = Function->getReturnType();

    if (OutBaseTy->isVoidTy()) {
      // The kernel writes its result through its first (pointer) parameter.
      PassOutByPointer = true;
      OutTy = ArgIter->getType();

      ArgIter++;
      --NumRemainingInputs;
    } else {
      // We don't advance ArgIter, since we are using the actual return type.
      OutTy = OutBaseTy->getPointerTo();
    }

    // Load the base pointer of the first output (p->outPtr[0]).
    SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
    OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));

    if (gEnableRsTbaa) {
      OutBasePtr->setMetadata("tbaa", TBAAPointer);
    }

    CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out");
  }

  llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
  llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;

  // NOTE(review): at this point NumRemainingInputs still includes the special
  // arguments, so this check is stricter than the comment on the limit
  // suggests -- confirm this is intended.
  bccAssert(NumRemainingInputs <= RS_KERNEL_INPUT_LIMIT);

  // Create the loop structure.
  llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
  llvm::Value *IV;
  createLoop(Builder, Arg_x1, Arg_x2, &IV);

  llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
  const int CalleeArgsContextIdx =
    ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
                           [&NumRemainingInputs]() { --NumRemainingInputs; },
                           LoopHeader->getTerminator());

  // After ExpandSpecialArguments() gets called, NumRemainingInputs
  // counts the number of arguments to the kernel that correspond to
  // an array entry from the InPtr field of the DriverInfo
  // structure.
  const size_t NumInPtrArguments = NumRemainingInputs;

  if (NumInPtrArguments > 0) {
    ExpandInputsLoopInvariant(Builder, LoopHeader, Arg_p, TBAAPointer, ArgIter, NumInPtrArguments,
                              InBufPtrs, InStructTempSlots);
  }

  // Populate the actual call to kernel().
  llvm::SmallVector<llvm::Value*, 8> RootArgs;

  // Calculate the current input and output pointers.

  // Output

  llvm::Value *OutPtr = nullptr;
  if (CastedOutBasePtr) {
    // The base pointer is already typed, so the GEP indexes by element
    // (IV - x1) rather than by byte offset.
    llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
    OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffset);

    if (PassOutByPointer) {
      RootArgs.push_back(OutPtr);
    }
  }

  // Inputs

  if (NumInPtrArguments > 0) {
    ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInPtrArguments,
                     InBufPtrs, InStructTempSlots, IV, RootArgs);
  }

  finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);

  llvm::Value *RetVal = Builder.CreateCall(Function, RootArgs);

  // When the kernel returns its result by value, store it into the output
  // element ourselves.
  if (OutPtr && !PassOutByPointer) {
    RetVal->setName("call.result");
    llvm::StoreInst *Store = Builder.CreateStore(RetVal, OutPtr);
    if (gEnableRsTbaa) {
      Store->setMetadata("tbaa", TBAAAllocation);
    }
  }

  return true;
}

// Expand a simple reduce-style kernel function.
//
// The input is a kernel which represents a binary operation,
// of the form
//
//   define foo @func(foo %a, foo %b),
//
// (More generally, it can be of the forms
//
//   define void @func(foo* %ret, foo* %a, foo* %b)
//   define void @func(foo* %ret, foo1 %a, foo1 %b)
//   define foo1 @func(foo2 %a, foo2 %b)
//
// as a result of argument / return value conversions. Here, "foo1"
// and "foo2" refer to possibly coerced types, and the coerced
// argument type may be different from the coerced return type. See
// "Note on coercion" below.)
//
// Note also, we do not expect to encounter any case when the
// arguments are promoted to pointers but the return value is
// unpromoted to pointer, e.g.
//
//   define foo1 @func(foo* %a, foo* %b)
//
// and we will throw an assertion in this case.
//
// The input kernel gets expanded into a kernel of the form
//
//   define void @func.expand(i8* %inBuf, i8* %outBuf, i32 %len)
//
// which performs a serial reduction of `len` elements from `inBuf`,
// and stores the result into `outBuf`. In pseudocode, @func.expand
// does:
//
//   inArr := (foo *)inBuf;
//   accum := inArr[0];
//   for (i := 1; i < len; ++i) {
//     accum := func(accum, inArr[i]);
//   }
//   *(foo *)outBuf := accum;
//
// Note on coercion
//
// Both the return value and the argument types may undergo internal
// coercion in clang as part of call lowering. As a result, the
// return value type may differ from the argument type even if the
// types in the RenderScript signature are the same. For instance, the
// kernel
//
//   int3 add(int3 a, int3 b) { return a + b; }
//
// gets lowered by clang as
//
//   define <3 x i32> @add(<4 x i32> %a.coerce, <4 x i32> %b.coerce)
//
// under AArch64. The details of this process are found in clang,
// lib/CodeGen/TargetInfo.cpp, under classifyArgumentType() and
// classifyReturnType() in ARMABIInfo, AArch64ABIInfo. If the value
// is passed by pointer, then the pointed-to type is not coerced.
//
// Since we lack the original type information, this code does loads
// and stores of allocation data by way of pointers to the coerced
// type.
1118 bool ExpandReduce(llvm::Function *Function) { 1119 bccAssert(Function); 1120 1121 ALOGV("Expanding simple reduce kernel %s", Function->getName().str().c_str()); 1122 1123 llvm::DataLayout DL(Module); 1124 1125 // TBAA Metadata 1126 llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript, *TBAAAllocation; 1127 llvm::MDBuilder MDHelper(*Context); 1128 1129 TBAARenderScriptDistinct = 1130 MDHelper.createTBAARoot(kRenderScriptTBAARootName); 1131 TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName, 1132 TBAARenderScriptDistinct); 1133 TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation", 1134 TBAARenderScript); 1135 TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation, 1136 TBAAAllocation, 0); 1137 1138 llvm::Function *ExpandedFunction = 1139 createEmptyExpandedReduceKernel(Function->getName()); 1140 1141 // Extract the expanded kernel's parameters. It is guaranteed by 1142 // createEmptyExpandedReduceKernel that there will be 3 parameters. 1143 auto ExpandedFunctionArgIter = ExpandedFunction->arg_begin(); 1144 1145 llvm::Value *Arg_inBuf = &*(ExpandedFunctionArgIter++); 1146 llvm::Value *Arg_outBuf = &*(ExpandedFunctionArgIter++); 1147 llvm::Value *Arg_len = &*(ExpandedFunctionArgIter++); 1148 1149 bccAssert(Function->arg_size() == 2 || Function->arg_size() == 3); 1150 1151 // Check if, instead of returning a value, the original kernel has 1152 // a pointer parameter which points to a temporary buffer into 1153 // which the return value gets written. 1154 const bool ReturnValuePointerStyle = (Function->arg_size() == 3); 1155 bccAssert(Function->getReturnType()->isVoidTy() == ReturnValuePointerStyle); 1156 1157 // Check if, instead of being passed by value, the inputs to the 1158 // original kernel are passed by pointer. 1159 auto FirstArgIter = Function->arg_begin(); 1160 // The second argument is always an input to the original kernel. 
1161 auto SecondArgIter = std::next(FirstArgIter); 1162 const bool InputsPointerStyle = SecondArgIter->getType()->isPointerTy(); 1163 1164 // Get the output type (i.e. return type of the original kernel). 1165 llvm::PointerType *OutPtrTy = nullptr; 1166 llvm::Type *OutTy = nullptr; 1167 if (ReturnValuePointerStyle) { 1168 OutPtrTy = llvm::dyn_cast<llvm::PointerType>(FirstArgIter->getType()); 1169 bccAssert(OutPtrTy && "Expected a pointer parameter to kernel"); 1170 OutTy = OutPtrTy->getElementType(); 1171 } else { 1172 OutTy = Function->getReturnType(); 1173 bccAssert(!OutTy->isVoidTy()); 1174 OutPtrTy = OutTy->getPointerTo(); 1175 } 1176 1177 // Get the input type (type of the arguments to the original 1178 // kernel). Some input types are different from the output type, 1179 // due to explicit coercion that the compiler performs when 1180 // lowering the parameters. See "Note on coercion" above. 1181 llvm::PointerType *InPtrTy; 1182 llvm::Type *InTy; 1183 if (InputsPointerStyle) { 1184 InPtrTy = llvm::dyn_cast<llvm::PointerType>(SecondArgIter->getType()); 1185 bccAssert(InPtrTy && "Expected a pointer parameter to kernel"); 1186 bccAssert(ReturnValuePointerStyle); 1187 bccAssert(std::next(SecondArgIter)->getType() == InPtrTy && 1188 "Input type mismatch"); 1189 InTy = InPtrTy->getElementType(); 1190 } else { 1191 InTy = SecondArgIter->getType(); 1192 InPtrTy = InTy->getPointerTo(); 1193 if (!ReturnValuePointerStyle) { 1194 bccAssert(InTy == FirstArgIter->getType() && "Input type mismatch"); 1195 } else { 1196 bccAssert(InTy == std::next(SecondArgIter)->getType() && 1197 "Input type mismatch"); 1198 } 1199 } 1200 1201 // The input type should take up the same amount of space in 1202 // memory as the output type. 1203 bccAssert(DL.getTypeAllocSize(InTy) == DL.getTypeAllocSize(OutTy)); 1204 1205 // Construct the actual function body. 
1206 llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin()); 1207 1208 // Cast input and output buffers to appropriate types. 1209 llvm::Value *InBuf = Builder.CreatePointerCast(Arg_inBuf, InPtrTy); 1210 llvm::Value *OutBuf = Builder.CreatePointerCast(Arg_outBuf, OutPtrTy); 1211 1212 // Create a slot to pass temporary results back. This needs to be 1213 // separate from the accumulator slot because the kernel may mark 1214 // the return value slot as noalias. 1215 llvm::Value *ReturnBuf = nullptr; 1216 if (ReturnValuePointerStyle) { 1217 ReturnBuf = Builder.CreateAlloca(OutTy, nullptr, "ret.tmp"); 1218 } 1219 1220 // Create a slot to hold the second input if the inputs are passed 1221 // by pointer to the original kernel. We cannot directly pass a 1222 // pointer to the input buffer, because the kernel may modify its 1223 // inputs. 1224 llvm::Value *SecondInputTempBuf = nullptr; 1225 if (InputsPointerStyle) { 1226 SecondInputTempBuf = Builder.CreateAlloca(InTy, nullptr, "in.tmp"); 1227 } 1228 1229 // Create a slot to accumulate temporary results, and fill it with 1230 // the first value. 1231 llvm::Value *AccumBuf = Builder.CreateAlloca(OutTy, nullptr, "accum"); 1232 // Cast to OutPtrTy before loading, since AccumBuf has type OutPtrTy. 1233 llvm::LoadInst *FirstElementLoad = Builder.CreateLoad( 1234 Builder.CreatePointerCast(InBuf, OutPtrTy)); 1235 if (gEnableRsTbaa) { 1236 FirstElementLoad->setMetadata("tbaa", TBAAAllocation); 1237 } 1238 // Memory operations with AccumBuf shouldn't be marked with 1239 // RenderScript TBAA, since this might conflict with TBAA metadata 1240 // in the kernel function when AccumBuf is passed by pointer. 1241 Builder.CreateStore(FirstElementLoad, AccumBuf); 1242 1243 // Loop body 1244 1245 // Create the loop structure. Note that the first input in the input buffer 1246 // has already been accumulated, so that we start at index 1. 
1247 llvm::Value *IndVar; 1248 llvm::Value *Start = llvm::ConstantInt::get(Arg_len->getType(), 1); 1249 llvm::BasicBlock *Exit = createLoop(Builder, Start, Arg_len, &IndVar); 1250 1251 llvm::Value *InputPtr = Builder.CreateInBoundsGEP(InBuf, IndVar, "next_input.gep"); 1252 1253 // Set up arguments and call the original (unexpanded) kernel. 1254 // 1255 // The original kernel can have at most 3 arguments, which is 1256 // achieved when the signature looks like: 1257 // 1258 // define void @func(foo* %ret, bar %a, bar %b) 1259 // 1260 // (bar can be one of foo/foo.coerce/foo*). 1261 llvm::SmallVector<llvm::Value *, 3> KernelArgs; 1262 1263 if (ReturnValuePointerStyle) { 1264 KernelArgs.push_back(ReturnBuf); 1265 } 1266 1267 if (InputsPointerStyle) { 1268 bccAssert(ReturnValuePointerStyle); 1269 // Because the return buffer is copied back into the 1270 // accumulator, it's okay if the accumulator is overwritten. 1271 KernelArgs.push_back(AccumBuf); 1272 1273 llvm::LoadInst *InputLoad = Builder.CreateLoad(InputPtr); 1274 if (gEnableRsTbaa) { 1275 InputLoad->setMetadata("tbaa", TBAAAllocation); 1276 } 1277 Builder.CreateStore(InputLoad, SecondInputTempBuf); 1278 1279 KernelArgs.push_back(SecondInputTempBuf); 1280 } else { 1281 // InPtrTy may be different from OutPtrTy (the type of 1282 // AccumBuf), so first cast the accumulator buffer to the 1283 // pointer type corresponding to the input argument type. 1284 KernelArgs.push_back( 1285 Builder.CreateLoad(Builder.CreatePointerCast(AccumBuf, InPtrTy))); 1286 1287 llvm::LoadInst *LoadedArg = Builder.CreateLoad(InputPtr); 1288 if (gEnableRsTbaa) { 1289 LoadedArg->setMetadata("tbaa", TBAAAllocation); 1290 } 1291 KernelArgs.push_back(LoadedArg); 1292 } 1293 1294 llvm::Value *RetVal = Builder.CreateCall(Function, KernelArgs); 1295 1296 const uint64_t ElementSize = DL.getTypeStoreSize(OutTy); 1297 const uint64_t ElementAlign = DL.getABITypeAlignment(OutTy); 1298 1299 // Store the output in the accumulator. 
1300 if (ReturnValuePointerStyle) { 1301 Builder.CreateMemCpy(AccumBuf, ReturnBuf, ElementSize, ElementAlign); 1302 } else { 1303 Builder.CreateStore(RetVal, AccumBuf); 1304 } 1305 1306 // Loop exit 1307 Builder.SetInsertPoint(Exit, Exit->begin()); 1308 1309 llvm::LoadInst *OutputLoad = Builder.CreateLoad(AccumBuf); 1310 llvm::StoreInst *OutputStore = Builder.CreateStore(OutputLoad, OutBuf); 1311 if (gEnableRsTbaa) { 1312 OutputStore->setMetadata("tbaa", TBAAAllocation); 1313 } 1314 1315 return true; 1316 } 1317 1318 // Certain categories of functions that make up a general 1319 // reduce-style kernel are called directly from the driver with no 1320 // expansion needed. For a function in such a category, we need to 1321 // promote linkage from static to external, to ensure that the 1322 // function is visible to the driver in the dynamic symbol table. 1323 // This promotion is safe because we don't have any kind of cross 1324 // translation unit linkage model (except for linking against 1325 // RenderScript libraries), so we do not risk name clashes. 1326 bool PromoteReduceNewFunction(const char *Name, FunctionSet &PromotedFunctions) { 1327 if (!Name) // a presumably-optional function that is not present 1328 return false; 1329 1330 llvm::Function *Fn = Module->getFunction(Name); 1331 bccAssert(Fn != nullptr); 1332 if (PromotedFunctions.insert(Fn).second) { 1333 bccAssert(Fn->getLinkage() == llvm::GlobalValue::InternalLinkage); 1334 Fn->setLinkage(llvm::GlobalValue::ExternalLinkage); 1335 return true; 1336 } 1337 1338 return false; 1339 } 1340 1341 // Expand the accumulator function for a general reduce-style kernel. 1342 // 1343 // The input is a function of the form 1344 // 1345 // define void @func(accumType* %accum, foo1 in1[, ... fooN inN] [, special arguments]) 1346 // 1347 // where all arguments except the first are the same as for a foreach kernel. 
1348 // 1349 // The input accumulator function gets expanded into a function of the form 1350 // 1351 // define void @func.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, accumType* %accum) 1352 // 1353 // which performs a serial accumulaion of elements [x1, x2) into *%accum. 1354 // 1355 // In pseudocode, @func.expand does: 1356 // 1357 // for (i = %x1; i < %x2; ++i) { 1358 // func(%accum, 1359 // *((foo1 *)p->inPtr[0] + i)[, ... *((fooN *)p->inPtr[N-1] + i) 1360 // [, p] [, i] [, p->current.y] [, p->current.z]); 1361 // } 1362 // 1363 // This is very similar to foreach kernel expansion with no output. 1364 bool ExpandReduceNewAccumulator(llvm::Function *FnAccumulator, uint32_t Signature, size_t NumInputs) { 1365 ALOGV("Expanding accumulator %s for general reduce kernel", 1366 FnAccumulator->getName().str().c_str()); 1367 1368 // Create TBAA meta-data. 1369 llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript, 1370 *TBAAAllocation, *TBAAPointer; 1371 llvm::MDBuilder MDHelper(*Context); 1372 TBAARenderScriptDistinct = 1373 MDHelper.createTBAARoot(kRenderScriptTBAARootName); 1374 TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName, 1375 TBAARenderScriptDistinct); 1376 TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation", 1377 TBAARenderScript); 1378 TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation, 1379 TBAAAllocation, 0); 1380 TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer", 1381 TBAARenderScript); 1382 TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0); 1383 1384 auto AccumulatorArgIter = FnAccumulator->arg_begin(); 1385 1386 // Create empty accumulator function. 1387 llvm::Function *FnExpandedAccumulator = 1388 createEmptyExpandedReduceNewAccumulator(FnAccumulator->getName(), 1389 (AccumulatorArgIter++)->getType()); 1390 1391 // Extract the expanded accumulator's parameters. 
It is 1392 // guaranteed by createEmptyExpandedReduceNewAccumulator that 1393 // there will be 4 parameters. 1394 bccAssert(FnExpandedAccumulator->arg_size() == kNumExpandedReduceNewAccumulatorParams); 1395 auto ExpandedAccumulatorArgIter = FnExpandedAccumulator->arg_begin(); 1396 llvm::Value *Arg_p = &*(ExpandedAccumulatorArgIter++); 1397 llvm::Value *Arg_x1 = &*(ExpandedAccumulatorArgIter++); 1398 llvm::Value *Arg_x2 = &*(ExpandedAccumulatorArgIter++); 1399 llvm::Value *Arg_accum = &*(ExpandedAccumulatorArgIter++); 1400 1401 // Construct the actual function body. 1402 llvm::IRBuilder<> Builder(FnExpandedAccumulator->getEntryBlock().begin()); 1403 1404 // Create the loop structure. 1405 llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock(); 1406 llvm::Value *IndVar; 1407 createLoop(Builder, Arg_x1, Arg_x2, &IndVar); 1408 1409 llvm::SmallVector<llvm::Value*, 8> CalleeArgs; 1410 const int CalleeArgsContextIdx = 1411 ExpandSpecialArguments(Signature, IndVar, Arg_p, Builder, CalleeArgs, 1412 [](){}, LoopHeader->getTerminator()); 1413 1414 llvm::SmallVector<llvm::Value*, 8> InBufPtrs; 1415 llvm::SmallVector<llvm::Value*, 8> InStructTempSlots; 1416 ExpandInputsLoopInvariant(Builder, LoopHeader, Arg_p, TBAAPointer, AccumulatorArgIter, NumInputs, 1417 InBufPtrs, InStructTempSlots); 1418 1419 // Populate the actual call to the original accumulator. 
1420 llvm::SmallVector<llvm::Value*, 8> RootArgs; 1421 RootArgs.push_back(Arg_accum); 1422 ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInputs, InBufPtrs, InStructTempSlots, 1423 IndVar, RootArgs); 1424 finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *FnAccumulator, Builder); 1425 Builder.CreateCall(FnAccumulator, RootArgs); 1426 1427 return true; 1428 } 1429 1430 /// @brief Checks if pointers to allocation internals are exposed 1431 /// 1432 /// This function verifies if through the parameters passed to the kernel 1433 /// or through calls to the runtime library the script gains access to 1434 /// pointers pointing to data within a RenderScript Allocation. 1435 /// If we know we control all loads from and stores to data within 1436 /// RenderScript allocations and if we know the run-time internal accesses 1437 /// are all annotated with RenderScript TBAA metadata, only then we 1438 /// can safely use TBAA to distinguish between generic and from-allocation 1439 /// pointers. 1440 bool allocPointersExposed(llvm::Module &Module) { 1441 // Old style kernel function can expose pointers to elements within 1442 // allocations. 1443 // TODO: Extend analysis to allow simple cases of old-style kernels. 1444 for (size_t i = 0; i < mExportForEachCount; ++i) { 1445 const char *Name = mExportForEachNameList[i]; 1446 uint32_t Signature = mExportForEachSignatureList[i]; 1447 if (Module.getFunction(Name) && 1448 !bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature)) { 1449 return true; 1450 } 1451 } 1452 1453 // Check for library functions that expose a pointer to an Allocation or 1454 // that are not yet annotated with RenderScript-specific tbaa information. 1455 static const std::vector<const char *> Funcs{ 1456 // rsGetElementAt(...) 
1457 "_Z14rsGetElementAt13rs_allocationj", 1458 "_Z14rsGetElementAt13rs_allocationjj", 1459 "_Z14rsGetElementAt13rs_allocationjjj", 1460 1461 // rsSetElementAt() 1462 "_Z14rsSetElementAt13rs_allocationPvj", 1463 "_Z14rsSetElementAt13rs_allocationPvjj", 1464 "_Z14rsSetElementAt13rs_allocationPvjjj", 1465 1466 // rsGetElementAtYuv_uchar_Y() 1467 "_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj", 1468 1469 // rsGetElementAtYuv_uchar_U() 1470 "_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj", 1471 1472 // rsGetElementAtYuv_uchar_V() 1473 "_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj", 1474 }; 1475 1476 for (auto FI : Funcs) { 1477 llvm::Function *Function = Module.getFunction(FI); 1478 1479 if (!Function) { 1480 ALOGE("Missing run-time function '%s'", FI); 1481 return true; 1482 } 1483 1484 if (Function->getNumUses() > 0) { 1485 return true; 1486 } 1487 } 1488 1489 return false; 1490 } 1491 1492 /// @brief Connect RenderScript TBAA metadata to C/C++ metadata 1493 /// 1494 /// The TBAA metadata used to annotate loads/stores from RenderScript 1495 /// Allocations is generated in a separate TBAA tree with a 1496 /// "RenderScript Distinct TBAA" root node. LLVM does assume may-alias for 1497 /// all nodes in unrelated alias analysis trees. This function makes the 1498 /// "RenderScript TBAA" node (which is parented by the Distinct TBAA root), 1499 /// a subtree of the normal C/C++ TBAA tree aside of normal C/C++ types. With 1500 /// the connected trees every access to an Allocation is resolved to 1501 /// must-alias if compared to a normal C/C++ access. 
1502 void connectRenderScriptTBAAMetadata(llvm::Module &Module) { 1503 llvm::MDBuilder MDHelper(*Context); 1504 llvm::MDNode *TBAARenderScriptDistinct = 1505 MDHelper.createTBAARoot("RenderScript Distinct TBAA"); 1506 llvm::MDNode *TBAARenderScript = MDHelper.createTBAANode( 1507 "RenderScript TBAA", TBAARenderScriptDistinct); 1508 llvm::MDNode *TBAARoot = MDHelper.createTBAARoot("Simple C/C++ TBAA"); 1509 TBAARenderScript->replaceOperandWith(1, TBAARoot); 1510 } 1511 1512 virtual bool runOnModule(llvm::Module &Module) { 1513 bool Changed = false; 1514 this->Module = &Module; 1515 Context = &Module.getContext(); 1516 1517 buildTypes(); 1518 1519 bcinfo::MetadataExtractor me(&Module); 1520 if (!me.extract()) { 1521 ALOGE("Could not extract metadata from module!"); 1522 return false; 1523 } 1524 1525 // Expand forEach_* style kernels. 1526 mExportForEachCount = me.getExportForEachSignatureCount(); 1527 mExportForEachNameList = me.getExportForEachNameList(); 1528 mExportForEachSignatureList = me.getExportForEachSignatureList(); 1529 1530 for (size_t i = 0; i < mExportForEachCount; ++i) { 1531 const char *name = mExportForEachNameList[i]; 1532 uint32_t signature = mExportForEachSignatureList[i]; 1533 llvm::Function *kernel = Module.getFunction(name); 1534 if (kernel) { 1535 if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) { 1536 Changed |= ExpandForEach(kernel, signature); 1537 kernel->setLinkage(llvm::GlobalValue::InternalLinkage); 1538 } else if (kernel->getReturnType()->isVoidTy()) { 1539 Changed |= ExpandOldStyleForEach(kernel, signature); 1540 kernel->setLinkage(llvm::GlobalValue::InternalLinkage); 1541 } else { 1542 // There are some graphics root functions that are not 1543 // expanded, but that will be called directly. For those 1544 // functions, we can not set the linkage to internal. 1545 } 1546 } 1547 } 1548 1549 // Expand simple reduce_* style kernels. 
1550 mExportReduceCount = me.getExportReduceCount(); 1551 mExportReduceNameList = me.getExportReduceNameList(); 1552 1553 for (size_t i = 0; i < mExportReduceCount; ++i) { 1554 llvm::Function *kernel = Module.getFunction(mExportReduceNameList[i]); 1555 if (kernel) { 1556 Changed |= ExpandReduce(kernel); 1557 } 1558 } 1559 1560 // Process general reduce_* style functions. 1561 const size_t ExportReduceNewCount = me.getExportReduceNewCount(); 1562 const bcinfo::MetadataExtractor::ReduceNew *ExportReduceNewList = me.getExportReduceNewList(); 1563 // Note that functions can be shared between kernels 1564 FunctionSet PromotedFunctions, ExpandedAccumulators; 1565 1566 for (size_t i = 0; i < ExportReduceNewCount; ++i) { 1567 Changed |= PromoteReduceNewFunction(ExportReduceNewList[i].mInitializerName, PromotedFunctions); 1568 Changed |= PromoteReduceNewFunction(ExportReduceNewList[i].mOutConverterName, PromotedFunctions); 1569 1570 // Accumulator 1571 llvm::Function *accumulator = Module.getFunction(ExportReduceNewList[i].mAccumulatorName); 1572 bccAssert(accumulator != nullptr); 1573 if (ExpandedAccumulators.insert(accumulator).second) 1574 Changed |= ExpandReduceNewAccumulator(accumulator, 1575 ExportReduceNewList[i].mSignature, 1576 ExportReduceNewList[i].mInputCount); 1577 } 1578 1579 if (gEnableRsTbaa && !allocPointersExposed(Module)) { 1580 connectRenderScriptTBAAMetadata(Module); 1581 } 1582 1583 return Changed; 1584 } 1585 1586 virtual const char *getPassName() const { 1587 return "forEach_* and reduce_* function expansion"; 1588 } 1589 1590}; // end RSKernelExpandPass 1591 1592} // end anonymous namespace 1593 1594char RSKernelExpandPass::ID = 0; 1595static llvm::RegisterPass<RSKernelExpandPass> X("kernelexp", "Kernel Expand Pass"); 1596 1597namespace bcc { 1598 1599const char BCC_INDEX_VAR_NAME[] = "rsIndex"; 1600 1601llvm::ModulePass * 1602createRSKernelExpandPass(bool pEnableStepOpt) { 1603 return new RSKernelExpandPass(pEnableStepOpt); 1604} 1605 1606} // end 
namespace bcc 1607