RSKernelExpand.cpp revision c6c9c1f04b480a395daa1bdd5d634060e505bd80
1/* 2 * Copyright 2012, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "bcc/Assert.h" 18#include "bcc/Renderscript/RSTransforms.h" 19#include "bcc/Renderscript/RSUtils.h" 20 21#include <cstdlib> 22#include <functional> 23#include <unordered_set> 24 25#include <llvm/IR/DerivedTypes.h> 26#include <llvm/IR/Function.h> 27#include <llvm/IR/Instructions.h> 28#include <llvm/IR/IRBuilder.h> 29#include <llvm/IR/MDBuilder.h> 30#include <llvm/IR/Module.h> 31#include <llvm/Pass.h> 32#include <llvm/Support/raw_ostream.h> 33#include <llvm/IR/DataLayout.h> 34#include <llvm/IR/Function.h> 35#include <llvm/IR/Type.h> 36#include <llvm/Transforms/Utils/BasicBlockUtils.h> 37 38#include "bcc/Config/Config.h" 39#include "bcc/Support/Log.h" 40 41#include "bcinfo/MetadataExtractor.h" 42 43#ifndef __DISABLE_ASSERTS 44// Only used in bccAssert() 45const int kNumExpandedForeachParams = 4; 46const int kNumExpandedReduceParams = 3; 47const int kNumExpandedReduceNewAccumulatorParams = 4; 48#endif 49 50const char kRenderScriptTBAARootName[] = "RenderScript Distinct TBAA"; 51const char kRenderScriptTBAANodeName[] = "RenderScript TBAA"; 52 53using namespace bcc; 54 55namespace { 56 57static const bool gEnableRsTbaa = true; 58 59/* RSKernelExpandPass - This pass operates on functions that are able 60 * to be called via rsForEach(), "foreach_<NAME>", or 61 * "reduce_<NAME>". 
We create an inner loop for the function to be 62 * invoked over the appropriate data cells of the input/output 63 * allocations (adjusting other relevant parameters as we go). We 64 * support doing this for any forEach or reduce style compute 65 * kernels. The new function name is the original function name 66 * followed by ".expand". Note that we still generate code for the 67 * original function. 68 */ 69class RSKernelExpandPass : public llvm::ModulePass { 70public: 71 static char ID; 72 73private: 74 static const size_t RS_KERNEL_INPUT_LIMIT = 8; // see frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h 75 76 typedef std::unordered_set<llvm::Function *> FunctionSet; 77 78 enum RsLaunchDimensionsField { 79 RsLaunchDimensionsFieldX, 80 RsLaunchDimensionsFieldY, 81 RsLaunchDimensionsFieldZ, 82 RsLaunchDimensionsFieldLod, 83 RsLaunchDimensionsFieldFace, 84 RsLaunchDimensionsFieldArray, 85 86 RsLaunchDimensionsFieldCount 87 }; 88 89 enum RsExpandKernelDriverInfoPfxField { 90 RsExpandKernelDriverInfoPfxFieldInPtr, 91 RsExpandKernelDriverInfoPfxFieldInStride, 92 RsExpandKernelDriverInfoPfxFieldInLen, 93 RsExpandKernelDriverInfoPfxFieldOutPtr, 94 RsExpandKernelDriverInfoPfxFieldOutStride, 95 RsExpandKernelDriverInfoPfxFieldOutLen, 96 RsExpandKernelDriverInfoPfxFieldDim, 97 RsExpandKernelDriverInfoPfxFieldCurrent, 98 RsExpandKernelDriverInfoPfxFieldUsr, 99 RsExpandKernelDriverInfoPfxFieldUsLenr, 100 101 RsExpandKernelDriverInfoPfxFieldCount 102 }; 103 104 llvm::Module *Module; 105 llvm::LLVMContext *Context; 106 107 /* 108 * Pointers to LLVM type information for the the function signatures 109 * for expanded functions. These must be re-calculated for each module 110 * the pass is run on. 
111 */ 112 llvm::FunctionType *ExpandedForEachType, *ExpandedReduceType; 113 llvm::Type *RsExpandKernelDriverInfoPfxTy; 114 115 uint32_t mExportForEachCount; 116 const char **mExportForEachNameList; 117 const uint32_t *mExportForEachSignatureList; 118 119 uint32_t mExportReduceCount; 120 const char **mExportReduceNameList; 121 122 // Turns on optimization of allocation stride values. 123 bool mEnableStepOpt; 124 125 uint32_t getRootSignature(llvm::Function *Function) { 126 const llvm::NamedMDNode *ExportForEachMetadata = 127 Module->getNamedMetadata("#rs_export_foreach"); 128 129 if (!ExportForEachMetadata) { 130 llvm::SmallVector<llvm::Type*, 8> RootArgTys; 131 for (llvm::Function::arg_iterator B = Function->arg_begin(), 132 E = Function->arg_end(); 133 B != E; 134 ++B) { 135 RootArgTys.push_back(B->getType()); 136 } 137 138 // For pre-ICS bitcode, we may not have signature information. In that 139 // case, we use the size of the RootArgTys to select the number of 140 // arguments. 141 return (1 << RootArgTys.size()) - 1; 142 } 143 144 if (ExportForEachMetadata->getNumOperands() == 0) { 145 return 0; 146 } 147 148 bccAssert(ExportForEachMetadata->getNumOperands() > 0); 149 150 // We only handle the case for legacy root() functions here, so this is 151 // hard-coded to look at only the first such function. 
152 llvm::MDNode *SigNode = ExportForEachMetadata->getOperand(0); 153 if (SigNode != nullptr && SigNode->getNumOperands() == 1) { 154 llvm::Metadata *SigMD = SigNode->getOperand(0); 155 if (llvm::MDString *SigS = llvm::dyn_cast<llvm::MDString>(SigMD)) { 156 llvm::StringRef SigString = SigS->getString(); 157 uint32_t Signature = 0; 158 if (SigString.getAsInteger(10, Signature)) { 159 ALOGE("Non-integer signature value '%s'", SigString.str().c_str()); 160 return 0; 161 } 162 return Signature; 163 } 164 } 165 166 return 0; 167 } 168 169 bool isStepOptSupported(llvm::Type *AllocType) { 170 171 llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType); 172 llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context); 173 174 if (mEnableStepOpt) { 175 return false; 176 } 177 178 if (AllocType == VoidPtrTy) { 179 return false; 180 } 181 182 if (!PT) { 183 return false; 184 } 185 186 // remaining conditions are 64-bit only 187 if (VoidPtrTy->getPrimitiveSizeInBits() == 32) { 188 return true; 189 } 190 191 // coerce suggests an upconverted struct type, which we can't support 192 if (AllocType->getStructName().find("coerce") != llvm::StringRef::npos) { 193 return false; 194 } 195 196 // 2xi64 and i128 suggest an upconverted struct type, which are also unsupported 197 llvm::Type *V2xi64Ty = llvm::VectorType::get(llvm::Type::getInt64Ty(*Context), 2); 198 llvm::Type *Int128Ty = llvm::Type::getIntNTy(*Context, 128); 199 if (AllocType == V2xi64Ty || AllocType == Int128Ty) { 200 return false; 201 } 202 203 return true; 204 } 205 206 // Get the actual value we should use to step through an allocation. 207 // 208 // Normally the value we use to step through an allocation is given to us by 209 // the driver. However, for certain primitive data types, we can derive an 210 // integer constant for the step value. We use this integer constant whenever 211 // possible to allow further compiler optimizations to take place. 212 // 213 // DL - Target Data size/layout information. 
  // T - Type of allocation (should be a pointer).
  // OrigStep - Original step increment (root.expand() input from driver).
  llvm::Value *getStepValue(llvm::DataLayout *DL, llvm::Type *AllocType,
                            llvm::Value *OrigStep) {
    bccAssert(DL);
    bccAssert(AllocType);
    bccAssert(OrigStep);
    llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
    if (isStepOptSupported(AllocType)) {
      // isStepOptSupported() returns false for non-pointer types, so PT is
      // non-null on this path; use the pointee's alloc size as the stride.
      llvm::Type *ET = PT->getElementType();
      uint64_t ETSize = DL->getTypeAllocSize(ET);
      llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
      return llvm::ConstantInt::get(Int32Ty, ETSize);
    } else {
      // Fall back to the stride value supplied by the driver.
      return OrigStep;
    }
  }

  /// Builds the types required by the pass for the given context.
  ///
  /// Populates RsExpandKernelDriverInfoPfxTy, ExpandedForEachType and
  /// ExpandedReduceType.  The struct layouts below mirror driver-side C
  /// structs field-for-field; the enums RsLaunchDimensionsField and
  /// RsExpandKernelDriverInfoPfxField index into them, so the push_back
  /// order here is load-bearing.
  void buildTypes(void) {
    // Create the RsLaunchDimensionsTy and RsExpandKernelDriverInfoPfxTy structs.

    llvm::Type *Int8Ty = llvm::Type::getInt8Ty(*Context);
    llvm::Type *Int8PtrTy = Int8Ty->getPointerTo();
    llvm::Type *Int8PtrArrayInputLimitTy = llvm::ArrayType::get(Int8PtrTy, RS_KERNEL_INPUT_LIMIT);
    llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
    llvm::Type *Int32ArrayInputLimitTy = llvm::ArrayType::get(Int32Ty, RS_KERNEL_INPUT_LIMIT);
    llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context);
    llvm::Type *Int32Array4Ty = llvm::ArrayType::get(Int32Ty, 4);

    /* Defined in frameworks/base/libs/rs/cpu_ref/rsCpuCore.h:
     *
     * struct RsLaunchDimensions {
     *   uint32_t x;
     *   uint32_t y;
     *   uint32_t z;
     *   uint32_t lod;
     *   uint32_t face;
     *   uint32_t array[4];
     * };
     */
    llvm::SmallVector<llvm::Type*, RsLaunchDimensionsFieldCount> RsLaunchDimensionsTypes;
    RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t x
    RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t y
    RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t z
    RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t lod
    RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t face
    RsLaunchDimensionsTypes.push_back(Int32Array4Ty); // uint32_t array[4]
    llvm::StructType *RsLaunchDimensionsTy =
        llvm::StructType::create(RsLaunchDimensionsTypes, "RsLaunchDimensions");

    /* Defined as the beginning of RsExpandKernelDriverInfo in frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h:
     *
     * struct RsExpandKernelDriverInfoPfx {
     *   const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT];
     *   uint32_t inStride[RS_KERNEL_INPUT_LIMIT];
     *   uint32_t inLen;
     *
     *   uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT];
     *   uint32_t outStride[RS_KERNEL_INPUT_LIMIT];
     *   uint32_t outLen;
     *
     *   // Dimension of the launch
     *   RsLaunchDimensions dim;
     *
     *   // The walking iterator of the launch
     *   RsLaunchDimensions current;
     *
     *   const void *usr;
     *   uint32_t usrLen;
     *
     *   // Items below this line are not used by the compiler and can be change in the driver.
     *   // So the compiler must assume there are an unknown number of fields of unknown type
     *   // beginning here.
     * };
     *
     * The name "RsExpandKernelDriverInfoPfx" is known to RSInvariantPass (RSInvariant.cpp).
     */
    llvm::SmallVector<llvm::Type*, RsExpandKernelDriverInfoPfxFieldCount> RsExpandKernelDriverInfoPfxTypes;
    RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT]
    RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy);   // uint32_t inStride[RS_KERNEL_INPUT_LIMIT]
    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t inLen
    RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT]
    RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy);   // uint32_t outStride[RS_KERNEL_INPUT_LIMIT]
    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t outLen
    RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy);     // RsLaunchDimensions dim
    RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy);     // RsLaunchDimensions current
    RsExpandKernelDriverInfoPfxTypes.push_back(VoidPtrTy);                // const void *usr
    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t usrLen
    RsExpandKernelDriverInfoPfxTy =
        llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx");

    // Create the function type for expanded kernels.
    llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);

    llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo();
    // void (const RsExpandKernelDriverInfoPfxTy *p, uint32_t x1, uint32_t x2, uint32_t outstep)
    ExpandedForEachType = llvm::FunctionType::get(VoidTy,
        {RsExpandKernelDriverInfoPfxPtrTy, Int32Ty, Int32Ty, Int32Ty}, false);

    // void (void *inBuf, void *outBuf, uint32_t len)
    ExpandedReduceType = llvm::FunctionType::get(VoidTy, {VoidPtrTy, VoidPtrTy, Int32Ty}, false);
  }

  /// @brief Create skeleton of the expanded foreach kernel.
  ///
  /// This creates a function with the following signature:
  ///
  ///   void (const RsForEachStubParamStruct *p, uint32_t x1, uint32_t x2,
  ///         uint32_t outstep)
  ///
  llvm::Function *createEmptyExpandedForEachKernel(llvm::StringRef OldName) {
    llvm::Function *ExpandedFunction =
      llvm::Function::Create(ExpandedForEachType,
                             llvm::GlobalValue::ExternalLinkage,
                             OldName + ".expand", Module);
    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
    llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
    (AI++)->setName("p");
    (AI++)->setName("x1");
    (AI++)->setName("x2");
    (AI++)->setName("arg_outstep");
    // Give the skeleton a single "ret void" so it is well-formed IR;
    // the caller inserts the real body ahead of the return.
    llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
                                                       ExpandedFunction);
    llvm::IRBuilder<> Builder(Begin);
    Builder.CreateRetVoid();
    return ExpandedFunction;
  }

  // Create skeleton of the expanded reduce kernel.
  //
  // This creates a function with the following signature:
  //
  //   void @func.expand(i8* nocapture %inBuf, i8* nocapture %outBuf, i32 len)
  //
  llvm::Function *createEmptyExpandedReduceKernel(llvm::StringRef OldName) {
    llvm::Function *ExpandedFunction =
      llvm::Function::Create(ExpandedReduceType,
                             llvm::GlobalValue::ExternalLinkage,
                             OldName + ".expand", Module);
    bccAssert(ExpandedFunction->arg_size() == kNumExpandedReduceParams);

    llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();

    using llvm::Attribute;

    // Attribute indices are 1-based for parameters (index 0 is the return
    // value), hence getArgNo() + 1 below.
    llvm::Argument *InBuf = &(*AI++);
    InBuf->setName("inBuf");
    InBuf->addAttr(llvm::AttributeSet::get(*Context, InBuf->getArgNo() + 1, llvm::makeArrayRef(Attribute::NoCapture)));

    llvm::Argument *OutBuf = &(*AI++);
    OutBuf->setName("outBuf");
    OutBuf->addAttr(llvm::AttributeSet::get(*Context, OutBuf->getArgNo() + 1, llvm::makeArrayRef(Attribute::NoCapture)));

    (AI++)->setName("len");

    // Skeleton body: a lone "ret void" the caller expands in front of.
    llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
                                                       ExpandedFunction);
    llvm::IRBuilder<> Builder(Begin);
    Builder.CreateRetVoid();

    return ExpandedFunction;
  }

  // Create skeleton of a general reduce kernel's expanded accumulator.
  //
  // This creates a function with the following signature:
  //
  //   void @func.expand(%RsExpandKernelDriverInfoPfx* nocapture %p,
  //                     i32 %x1, i32 %x2, accumType* nocapture %accum)
  //
  llvm::Function *createEmptyExpandedReduceNewAccumulator(llvm::StringRef OldName,
                                                          llvm::Type *AccumArgTy) {
    llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
    llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
    llvm::FunctionType *ExpandedReduceNewAccumulatorType =
        llvm::FunctionType::get(VoidTy,
                                {RsExpandKernelDriverInfoPfxTy->getPointerTo(),
                                 Int32Ty, Int32Ty, AccumArgTy}, false);
    llvm::Function *FnExpandedAccumulator =
      llvm::Function::Create(ExpandedReduceNewAccumulatorType,
                             llvm::GlobalValue::ExternalLinkage,
                             OldName + ".expand", Module);
    bccAssert(FnExpandedAccumulator->arg_size() == kNumExpandedReduceNewAccumulatorParams);

    llvm::Function::arg_iterator AI = FnExpandedAccumulator->arg_begin();

    using llvm::Attribute;

    // As above, parameter attribute indices are getArgNo() + 1.
    llvm::Argument *Arg_p = &(*AI++);
    Arg_p->setName("p");
    Arg_p->addAttr(llvm::AttributeSet::get(*Context, Arg_p->getArgNo() + 1,
                                           llvm::makeArrayRef(Attribute::NoCapture)));

    llvm::Argument *Arg_x1 = &(*AI++);
    Arg_x1->setName("x1");

    llvm::Argument *Arg_x2 = &(*AI++);
    Arg_x2->setName("x2");

    llvm::Argument *Arg_accum = &(*AI++);
    Arg_accum->setName("accum");
    Arg_accum->addAttr(llvm::AttributeSet::get(*Context, Arg_accum->getArgNo() + 1,
                                               llvm::makeArrayRef(Attribute::NoCapture)));

    // Skeleton body: a lone "ret void" the caller expands in front of.
    llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
                                                       FnExpandedAccumulator);
    llvm::IRBuilder<> Builder(Begin);
    Builder.CreateRetVoid();

    return FnExpandedAccumulator;
  }

  /// @brief Create an empty loop
  ///
  /// Create a loop of the form:
  ///
  /// for (i = LowerBound; i < UpperBound; i++)
  ///   ;
  ///
  /// After the loop has been created, the builder is set such that
  /// instructions can be added to the loop body.
  ///
  /// @param Builder The builder to use to build this loop. The current
  ///                position of the builder is the position the loop
  ///                will be inserted.
  /// @param LowerBound The first value of the loop iterator
  /// @param UpperBound The maximal value of the loop iterator
  /// @param LoopIV A reference that will be set to the loop iterator.
  /// @return The BasicBlock that will be executed after the loop.
  llvm::BasicBlock *createLoop(llvm::IRBuilder<> &Builder,
                               llvm::Value *LowerBound,
                               llvm::Value *UpperBound,
                               llvm::Value **LoopIV) {
    bccAssert(LowerBound->getType() == UpperBound->getType());

    llvm::BasicBlock *CondBB, *AfterBB, *HeaderBB;
    llvm::Value *Cond, *IVNext, *IV, *IVVar;

    // Split the current block: everything from the insert point onward
    // becomes AfterBB (the loop exit); the loop header is a fresh block.
    CondBB = Builder.GetInsertBlock();
    AfterBB = llvm::SplitBlock(CondBB, &*Builder.GetInsertPoint(), nullptr, nullptr);
    HeaderBB = llvm::BasicBlock::Create(*Context, "Loop", CondBB->getParent());

    // Remove the unconditional branch SplitBlock created so the guard
    // condition can be emitted in its place.
    CondBB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(CondBB);

    // decltype(LowerBound) *ivvar = alloca(sizeof(int))
    // *ivvar = LowerBound
    IVVar = Builder.CreateAlloca(LowerBound->getType(), nullptr, BCC_INDEX_VAR_NAME);
    Builder.CreateStore(LowerBound, IVVar);

    // if (LowerBound < Upperbound)
    //   goto LoopHeader
    // else
    //   goto AfterBB
    Cond = Builder.CreateICmpULT(LowerBound, UpperBound);
    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);

    // LoopHeader:
    //   iv = *ivvar
    //   <insertion point here>
    //   iv.next = iv + 1
    //   *ivvar = iv.next
    //   if (iv.next < Upperbound)
    //     goto LoopHeader
    //   else
    //     goto AfterBB
    // AfterBB:
    Builder.SetInsertPoint(HeaderBB);
    IV = Builder.CreateLoad(IVVar, "X");
    IVNext = Builder.CreateNUWAdd(IV, Builder.getInt32(1));
    Builder.CreateStore(IVNext, IVVar);
    Cond = Builder.CreateICmpULT(IVNext, UpperBound);
    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
    AfterBB->setName("Exit");
    // Leave the insertion point just before the increment so the caller's
    // loop-body instructions land between the load of IV and IVNext.
    Builder.SetInsertPoint(llvm::cast<llvm::Instruction>(IVNext));

    // Record information about this loop.
    *LoopIV = IV;
    return AfterBB;
  }

  // Finish building the outgoing argument list for calling a ForEach-able function.
  //
  // ArgVector - on input, the non-special arguments
  //             on output, the non-special arguments combined with the special arguments
  //             from SpecialArgVector
  // SpecialArgVector - special arguments (from ExpandSpecialArguments())
  // SpecialArgContextIdx - return value of ExpandSpecialArguments()
  //                        (position of context argument in SpecialArgVector)
  // CalleeFunction - the ForEach-able function being called
  // Builder - for inserting code into the caller function
  template<unsigned int ArgVectorLen, unsigned int SpecialArgVectorLen>
  void finishArgList(      llvm::SmallVector<llvm::Value *, ArgVectorLen>        &ArgVector,
                     const llvm::SmallVector<llvm::Value *, SpecialArgVectorLen> &SpecialArgVector,
                     const int SpecialArgContextIdx,
                     const llvm::Function &CalleeFunction,
                     llvm::IRBuilder<> &CallerBuilder) {
    /* The context argument (if any) is a pointer to an opaque user-visible type that differs from
     * the RsExpandKernelDriverInfoPfx type used in the function we are generating (although the
     * two types represent the same thing).  Therefore, we must introduce a pointer cast when
     * generating a call to the kernel function.
     */
    // Translate the context's position from SpecialArgVector-relative to
    // ArgVector-relative (negative means "no context argument").
    const int ArgContextIdx =
        SpecialArgContextIdx >= 0 ? (ArgVector.size() + SpecialArgContextIdx) : SpecialArgContextIdx;
    ArgVector.append(SpecialArgVector.begin(), SpecialArgVector.end());
    if (ArgContextIdx >= 0) {
      // Walk the callee's parameter list to find the declared type of the
      // context parameter, then cast our pointer to that type.
      llvm::Type *ContextArgType = nullptr;
      int ArgIdx = ArgContextIdx;
      for (const auto &Arg : CalleeFunction.getArgumentList()) {
        if (!ArgIdx--) {
          ContextArgType = Arg.getType();
          break;
        }
      }
      bccAssert(ContextArgType);
      ArgVector[ArgContextIdx] = CallerBuilder.CreatePointerCast(ArgVector[ArgContextIdx], ContextArgType);
    }
  }

  // GEPHelper() returns a SmallVector of values suitable for passing
  // to IRBuilder::CreateGEP(), and SmallGEPIndices is a typedef for
  // the returned data type. It is sized so that the SmallVector
  // returned by GEPHelper() never needs to do a heap allocation for
  // any list of GEP indices it encounters in the code.
  typedef llvm::SmallVector<llvm::Value *, 3> SmallGEPIndices;

  // Helper for turning a list of constant integer GEP indices into a
  // SmallVector of llvm::Value*. The return value is suitable for
  // passing to a GetElementPtrInst constructor or IRBuilder::CreateGEP().
  //
  // Inputs:
  //   I32Args should be integers which represent the index arguments
  //   to a GEP instruction.
  //
  // Returns:
  //   Returns a SmallVector of ConstantInts.
  SmallGEPIndices GEPHelper(const std::initializer_list<int32_t> I32Args) {
    SmallGEPIndices Out(I32Args.size());
    llvm::IntegerType *I32Ty = llvm::Type::getInt32Ty(*Context);
    std::transform(I32Args.begin(), I32Args.end(), Out.begin(),
                   [I32Ty](int32_t Arg) { return llvm::ConstantInt::get(I32Ty, Arg); });
    return Out;
  }

public:
  // pEnableStepOpt enables replacement of driver-supplied allocation
  // strides with compile-time constants where possible (see getStepValue()).
  RSKernelExpandPass(bool pEnableStepOpt = true)
      : ModulePass(ID), Module(nullptr), Context(nullptr),
        mEnableStepOpt(pEnableStepOpt) {

  }

  virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
    // This pass does not use any other analysis passes, but it does
    // add/wrap the existing functions in the module (thus altering the CFG).
  }

  // Build contribution to outgoing argument list for calling a
  // ForEach-able function or a general reduction accumulator
  // function, based on the special parameters of that function.
  //
  // Signature - metadata bits for the signature of the callee
  // X, Arg_p - values derived directly from expanded function,
  //            suitable for computing arguments for the callee
  // CalleeArgs - contribution is accumulated here
  // Bump - invoked once for each contributed outgoing argument
  // LoopHeaderInsertionPoint - an Instruction in the loop header, before which
  //                            this function can insert loop-invariant loads
  //
  // Return value is the (zero-based) position of the context (Arg_p)
  // argument in the CalleeArgs vector, or a negative value if the
  // context argument is not placed in the CalleeArgs vector.
  int ExpandSpecialArguments(uint32_t Signature,
                             llvm::Value *X,
                             llvm::Value *Arg_p,
                             llvm::IRBuilder<> &Builder,
                             llvm::SmallVector<llvm::Value*, 8> &CalleeArgs,
                             std::function<void ()> Bump,
                             llvm::Instruction *LoopHeaderInsertionPoint) {

    bccAssert(CalleeArgs.empty());

    int Return = -1;
    if (bcinfo::MetadataExtractor::hasForEachSignatureCtxt(Signature)) {
      CalleeArgs.push_back(Arg_p);
      Bump();
      Return = CalleeArgs.size() - 1;
    }

    if (bcinfo::MetadataExtractor::hasForEachSignatureX(Signature)) {
      CalleeArgs.push_back(X);
      Bump();
    }

    if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature) ||
        bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
      bccAssert(LoopHeaderInsertionPoint);

      // Y and Z are loop invariant, so they can be hoisted out of the
      // loop. Set the IRBuilder insertion point to the loop header.
      auto OldInsertionPoint = Builder.saveIP();
      Builder.SetInsertPoint(LoopHeaderInsertionPoint);

      if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature)) {
        // Load p->current.y once, outside the loop.
        SmallGEPIndices YValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent,
                                             RsLaunchDimensionsFieldY}));
        llvm::Value *YAddr = Builder.CreateInBoundsGEP(Arg_p, YValueGEP, "Y.gep");
        CalleeArgs.push_back(Builder.CreateLoad(YAddr, "Y"));
        Bump();
      }

      if (bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
        // Load p->current.z once, outside the loop.
        SmallGEPIndices ZValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent,
                                             RsLaunchDimensionsFieldZ}));
        llvm::Value *ZAddr = Builder.CreateInBoundsGEP(Arg_p, ZValueGEP, "Z.gep");
        CalleeArgs.push_back(Builder.CreateLoad(ZAddr, "Z"));
        Bump();
      }

      Builder.restoreIP(OldInsertionPoint);
    }

    return Return;
  }

  // Generate loop-invariant input processing setup code for an expanded
  // ForEach-able function or an expanded general reduction accumulator
  // function.
  //
  // LoopHeader - block at the end of which the setup code will be inserted
  // Arg_p - RSKernelDriverInfo pointer passed to the expanded function
  // TBAAPointer - metadata for marking loads of pointer values out of RSKernelDriverInfo
  // ArgIter - iterator pointing to first input of the UNexpanded function
  // NumInputs - number of inputs (NOT number of ARGUMENTS)
  //
  // InTypes[] - this function saves input type, they will be used in ExpandInputsBody().
  // InBufPtrs[] - this function sets each array element to point to the first cell / byte
  //               (byte for x86, cell for other platforms) of the corresponding input allocation
  // InStructTempSlots[] - this function sets each array element either to nullptr
  //                       or to the result of an alloca (for the case where the
  //                       calling convention dictates that a value must be passed
  //                       by reference, and so we need a stacked temporary to hold
  //                       a copy of that value)
  void ExpandInputsLoopInvariant(llvm::IRBuilder<> &Builder, llvm::BasicBlock *LoopHeader,
                                 llvm::Value *Arg_p,
                                 llvm::MDNode *TBAAPointer,
                                 llvm::Function::arg_iterator ArgIter,
                                 const size_t NumInputs,
                                 llvm::SmallVectorImpl<llvm::Type *> &InTypes,
                                 llvm::SmallVectorImpl<llvm::Value *> &InBufPtrs,
                                 llvm::SmallVectorImpl<llvm::Value *> &InStructTempSlots) {
    bccAssert(NumInputs <= RS_KERNEL_INPUT_LIMIT);

    // Extract information about input slots. The work done
    // here is loop-invariant, so we can hoist the operations out of the loop.
    auto OldInsertionPoint = Builder.saveIP();
    Builder.SetInsertPoint(LoopHeader->getTerminator());

    for (size_t InputIndex = 0; InputIndex < NumInputs; ++InputIndex, ArgIter++) {
      llvm::Type *InType = ArgIter->getType();

      /*
       * AArch64 calling conventions dictate that structs of sufficient size
       * get passed by pointer instead of passed by value. This, combined
       * with the fact that we don't allow kernels to operate on pointer
       * data means that if we see a kernel with a pointer parameter we know
       * that it is a struct input that has been promoted. As such we don't
       * need to convert its type to a pointer. Later we will need to know
       * to create a temporary copy on the stack, so we save this information
       * in InStructTempSlots.
       */
      if (auto PtrType = llvm::dyn_cast<llvm::PointerType>(InType)) {
        llvm::Type *ElementType = PtrType->getElementType();
        InStructTempSlots.push_back(Builder.CreateAlloca(ElementType, nullptr,
                                                         "input_struct_slot"));
      } else {
        InType = InType->getPointerTo();
        InStructTempSlots.push_back(nullptr);
      }

      // Load p->inPtr[InputIndex]: the base pointer of this input allocation.
      SmallGEPIndices InBufPtrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr,
                                             static_cast<int32_t>(InputIndex)}));
      llvm::Value *InBufPtrAddr = Builder.CreateInBoundsGEP(Arg_p, InBufPtrGEP, "input_buf.gep");
      llvm::LoadInst *InBufPtr = Builder.CreateLoad(InBufPtrAddr, "input_buf");

      llvm::Value *CastInBufPtr = nullptr;
      if (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING) {
        CastInBufPtr = Builder.CreatePointerCast(InBufPtr, InType, "casted_in");
      } else {
        // The disagreement between module and x86 target machine datalayout
        // causes mismatched input/output data offset between slang reflected
        // code and bcc codegen for GetElementPtr. To solve this issue, skip the
        // cast to InType and leave CastInBufPtr as an int8_t*. The buffer is
        // later indexed with an explicit byte offset computed based on
        // X86_CUSTOM_DL_STRING and then bitcast it to actual input type.
        CastInBufPtr = InBufPtr;
      }

      if (gEnableRsTbaa) {
        InBufPtr->setMetadata("tbaa", TBAAPointer);
      }

      InTypes.push_back(InType);
      InBufPtrs.push_back(CastInBufPtr);
    }

    Builder.restoreIP(OldInsertionPoint);
  }

  // Generate loop-varying input processing code for an expanded ForEach-able function
  // or an expanded general reduction accumulator function. Also, for the call to the
  // UNexpanded function, collect the portion of the argument list corresponding to the
  // inputs.
  //
  // Arg_x1 - first X coordinate to be processed by the expanded function
  // TBAAAllocation - metadata for marking loads of input values out of allocations
  // NumInputs -- number of inputs (NOT number of ARGUMENTS)
  // InTypes[] - this function uses the saved input types in ExpandInputsLoopInvariant()
  //             to convert the pointer of byte InPtr to its real type.
  // InBufPtrs[] - this function consumes the information produced by ExpandInputsLoopInvariant()
  // InStructTempSlots[] - this function consumes the information produced by ExpandInputsLoopInvariant()
  // IndVar - value of loop induction variable (X coordinate) for a given loop iteration
  //
  // RootArgs - this function sets this to the list of outgoing argument values corresponding
  //            to the inputs
  void ExpandInputsBody(llvm::IRBuilder<> &Builder,
                        llvm::Value *Arg_x1,
                        llvm::MDNode *TBAAAllocation,
                        const size_t NumInputs,
                        const llvm::SmallVectorImpl<llvm::Type *> &InTypes,
                        const llvm::SmallVectorImpl<llvm::Value *> &InBufPtrs,
                        const llvm::SmallVectorImpl<llvm::Value *> &InStructTempSlots,
                        llvm::Value *IndVar,
                        llvm::SmallVectorImpl<llvm::Value *> &RootArgs) {
    // Offset of the current cell from the first cell processed (X - x1).
    llvm::Value *Offset = Builder.CreateSub(IndVar, Arg_x1);
    llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);

    for (size_t Index = 0; Index < NumInputs; ++Index) {

      llvm::Value *InPtr = nullptr;
      if (Module->getTargetTriple()
!= DEFAULT_X86_TRIPLE_STRING) { 755 InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], Offset); 756 } else { 757 // Treat x86 input buffer as byte[], get indexed pointer with explicit 758 // byte offset computed using a datalayout based on 759 // X86_CUSTOM_DL_STRING, then bitcast it to actual input type. 760 llvm::DataLayout DL(X86_CUSTOM_DL_STRING); 761 llvm::Type *InTy = InTypes[Index]; 762 uint64_t InStep = DL.getTypeAllocSize(InTy->getPointerElementType()); 763 llvm::Value *OffsetInBytes = Builder.CreateMul(Offset, llvm::ConstantInt::get(Int32Ty, InStep)); 764 InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], OffsetInBytes); 765 InPtr = Builder.CreatePointerCast(InPtr, InTy); 766 } 767 768 llvm::Value *Input; 769 llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input"); 770 771 if (gEnableRsTbaa) { 772 InputLoad->setMetadata("tbaa", TBAAAllocation); 773 } 774 775 if (llvm::Value *TemporarySlot = InStructTempSlots[Index]) { 776 // Pass a pointer to a temporary on the stack, rather than 777 // passing a pointer to the original value. We do not want 778 // the kernel to potentially modify the input data. 779 780 // Note: don't annotate with TBAA, since the kernel might 781 // have its own TBAA annotations for the pointer argument. 782 Builder.CreateStore(InputLoad, TemporarySlot); 783 Input = TemporarySlot; 784 } else { 785 Input = InputLoad; 786 } 787 788 RootArgs.push_back(Input); 789 } 790 } 791 792 /* Performs the actual optimization on a selected function. On success, the 793 * Module will contain a new function of the name "<NAME>.expand" that 794 * invokes <NAME>() in a loop with the appropriate parameters. 
   */
  bool ExpandOldStyleForEach(llvm::Function *Function, uint32_t Signature) {
    ALOGV("Expanding ForEach-able Function %s",
          Function->getName().str().c_str());

    if (!Signature) {
      Signature = getRootSignature(Function);
      if (!Signature) {
        // We couldn't determine how to expand this function based on its
        // function signature.
        return false;
      }
    }

    llvm::DataLayout DL(Module);
    if (Module->getTargetTriple() == DEFAULT_X86_TRIPLE_STRING) {
      // x86 uses a custom datalayout to keep offsets consistent with the
      // slang-reflected code (see the comment in ExpandForEach below).
      DL.reset(X86_CUSTOM_DL_STRING);
    }

    llvm::Function *ExpandedFunction =
      createEmptyExpandedForEachKernel(Function->getName());

    /*
     * Extract the expanded function's parameters.  It is guaranteed by
     * createEmptyExpandedForEachKernel that there will be four parameters.
     */

    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);

    llvm::Function::arg_iterator ExpandedFunctionArgIter =
      ExpandedFunction->arg_begin();

    llvm::Value *Arg_p = &*(ExpandedFunctionArgIter++);
    llvm::Value *Arg_x1 = &*(ExpandedFunctionArgIter++);
    llvm::Value *Arg_x2 = &*(ExpandedFunctionArgIter++);
    llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);

    llvm::Value *InStep = nullptr;
    llvm::Value *OutStep = nullptr;

    // Construct the actual function body.
    llvm::IRBuilder<> Builder(&*ExpandedFunction->getEntryBlock().begin());

    // Collect and construct the arguments for the kernel().
    // Note that we load any loop-invariant arguments before entering the Loop.
    llvm::Function::arg_iterator FunctionArgIter = Function->arg_begin();

    llvm::Type *InTy = nullptr;
    llvm::Value *InBufPtr = nullptr;
    if (bcinfo::MetadataExtractor::hasForEachSignatureIn(Signature)) {
      // Load the input stride and the base input pointer from the
      // RsExpandKernelDriverInfoPfx structure pointed to by Arg_p.
      SmallGEPIndices InStepGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInStride, 0}));
      llvm::LoadInst *InStepArg = Builder.CreateLoad(
        Builder.CreateInBoundsGEP(Arg_p, InStepGEP, "instep_addr.gep"), "instep_addr");

      InTy = (FunctionArgIter++)->getType();
      InStep = getStepValue(&DL, InTy, InStepArg);

      InStep->setName("instep");

      SmallGEPIndices InputAddrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr, 0}));
      InBufPtr = Builder.CreateLoad(
        Builder.CreateInBoundsGEP(Arg_p, InputAddrGEP, "input_buf.gep"), "input_buf");
    }

    llvm::Type *OutTy = nullptr;
    llvm::Value *OutBasePtr = nullptr;
    if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
      OutTy = (FunctionArgIter++)->getType();
      OutStep = getStepValue(&DL, OutTy, Arg_outstep);
      OutStep->setName("outstep");
      SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
      OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
    }

    llvm::Value *UsrData = nullptr;
    if (bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)) {
      llvm::Type *UsrDataTy = (FunctionArgIter++)->getType();
      llvm::Value *UsrDataPointerAddr = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldUsr);
      UsrData = Builder.CreatePointerCast(Builder.CreateLoad(UsrDataPointerAddr), UsrDataTy);
      UsrData->setName("UsrData");
    }

    llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
    llvm::Value *IV;
    createLoop(Builder, Arg_x1, Arg_x2, &IV);

    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
    // The lambda advances FunctionArgIter past any "special" (context/coord)
    // parameters that ExpandSpecialArguments consumes.
    const int CalleeArgsContextIdx = ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
                                                            [&FunctionArgIter]() { FunctionArgIter++; },
                                                            LoopHeader->getTerminator());

    bccAssert(FunctionArgIter == Function->arg_end());

    // Populate the actual call to kernel().
    llvm::SmallVector<llvm::Value*, 8> RootArgs;

    llvm::Value *InPtr = nullptr;
    llvm::Value *OutPtr = nullptr;

    // Calculate the current input and output pointers
    //
    // We always calculate the input/output pointers with a GEP operating on i8
    // values and only cast at the very end to OutTy. This is because the step
    // between two values is given in bytes.
    //
    // TODO: We could further optimize the output by using a GEP operation of
    // type 'OutTy' in cases where the element type of the allocation allows.
    if (OutBasePtr) {
      llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
      OutOffset = Builder.CreateMul(OutOffset, OutStep);
      OutPtr = Builder.CreateInBoundsGEP(OutBasePtr, OutOffset);
      OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
    }

    if (InBufPtr) {
      llvm::Value *InOffset = Builder.CreateSub(IV, Arg_x1);
      InOffset = Builder.CreateMul(InOffset, InStep);
      InPtr = Builder.CreateInBoundsGEP(InBufPtr, InOffset);
      InPtr = Builder.CreatePointerCast(InPtr, InTy);
    }

    // Argument order to the unexpanded kernel: in, out, usrData (each only if
    // present per the signature), then the special arguments collected above.
    if (InPtr) {
      RootArgs.push_back(InPtr);
    }

    if (OutPtr) {
      RootArgs.push_back(OutPtr);
    }

    if (UsrData) {
      RootArgs.push_back(UsrData);
    }

    finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);

    Builder.CreateCall(Function, RootArgs);

    return true;
  }

  /* Expand a pass-by-value foreach kernel.
   */
  bool ExpandForEach(llvm::Function *Function, uint32_t Signature) {
    bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature));
    ALOGV("Expanding kernel Function %s", Function->getName().str().c_str());

    // TODO: Refactor this to share functionality with ExpandOldStyleForEach.
    llvm::DataLayout DL(Module);
    if (Module->getTargetTriple() == DEFAULT_X86_TRIPLE_STRING) {
      // See the x86 comments below: a custom datalayout keeps byte offsets
      // consistent with the slang-reflected code.
      DL.reset(X86_CUSTOM_DL_STRING);
    }
    llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);

    llvm::Function *ExpandedFunction =
      createEmptyExpandedForEachKernel(Function->getName());

    /*
     * Extract the expanded function's parameters.  It is guaranteed by
     * createEmptyExpandedForEachKernel that there will be four parameters.
     */

    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);

    llvm::Function::arg_iterator ExpandedFunctionArgIter =
      ExpandedFunction->arg_begin();

    llvm::Value *Arg_p = &*(ExpandedFunctionArgIter++);
    llvm::Value *Arg_x1 = &*(ExpandedFunctionArgIter++);
    llvm::Value *Arg_x2 = &*(ExpandedFunctionArgIter++);
    // Arg_outstep is not used by expanded new-style forEach kernels.

    // Construct the actual function body.
    llvm::IRBuilder<> Builder(&*ExpandedFunction->getEntryBlock().begin());

    // Create TBAA meta-data.
    llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript,
                 *TBAAAllocation, *TBAAPointer;
    llvm::MDBuilder MDHelper(*Context);

    TBAARenderScriptDistinct =
      MDHelper.createTBAARoot(kRenderScriptTBAARootName);
    TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
                                               TBAARenderScriptDistinct);
    TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
                                                       TBAARenderScript);
    TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
                                                      TBAAAllocation, 0);
    TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer",
                                                    TBAARenderScript);
    TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);

    /*
     * Collect and construct the arguments for the kernel().
     *
     * Note that we load any loop-invariant arguments before entering the Loop.
     */
    size_t NumRemainingInputs = Function->arg_size();

    // No usrData parameter on kernels.
    bccAssert(
        !bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature));

    llvm::Function::arg_iterator ArgIter = Function->arg_begin();

    // Check the return type
    llvm::Type *OutTy = nullptr;
    llvm::LoadInst *OutBasePtr = nullptr;
    llvm::Value *CastedOutBasePtr = nullptr;

    bool PassOutByPointer = false;

    if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
      llvm::Type *OutBaseTy = Function->getReturnType();

      if (OutBaseTy->isVoidTy()) {
        // A void return with an output means the kernel writes its result
        // through its first pointer parameter.
        PassOutByPointer = true;
        OutTy = ArgIter->getType();

        ArgIter++;
        --NumRemainingInputs;
      } else {
        // We don't increment Args, since we are using the actual return type.
        OutTy = OutBaseTy->getPointerTo();
      }

      SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
      OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));

      if (gEnableRsTbaa) {
        OutBasePtr->setMetadata("tbaa", TBAAPointer);
      }

      if (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING) {
        CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out");
      } else {
        // The disagreement between module and x86 target machine datalayout
        // causes mismatched input/output data offset between slang reflected
        // code and bcc codegen for GetElementPtr. To solve this issue, skip the
        // cast to OutTy and leave CastedOutBasePtr as an int8_t*. The buffer
        // is later indexed with an explicit byte offset computed based on
        // X86_CUSTOM_DL_STRING and then bitcast it to actual output type.
        CastedOutBasePtr = OutBasePtr;
      }
    }

    llvm::SmallVector<llvm::Type*, 8> InTypes;
    llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
    llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;

    bccAssert(NumRemainingInputs <= RS_KERNEL_INPUT_LIMIT);

    // Create the loop structure.
    llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
    llvm::Value *IV;
    createLoop(Builder, Arg_x1, Arg_x2, &IV);

    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
    const int CalleeArgsContextIdx =
      ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
                             [&NumRemainingInputs]() { --NumRemainingInputs; },
                             LoopHeader->getTerminator());

    // After ExpandSpecialArguments() gets called, NumRemainingInputs
    // counts the number of arguments to the kernel that correspond to
    // an array entry from the InPtr field of the DriverInfo
    // structure.
    const size_t NumInPtrArguments = NumRemainingInputs;

    if (NumInPtrArguments > 0) {
      ExpandInputsLoopInvariant(Builder, LoopHeader, Arg_p, TBAAPointer, ArgIter, NumInPtrArguments,
                                InTypes, InBufPtrs, InStructTempSlots);
    }

    // Populate the actual call to kernel().
    llvm::SmallVector<llvm::Value*, 8> RootArgs;

    // Calculate the current input and output pointers.

    // Output

    llvm::Value *OutPtr = nullptr;
    if (CastedOutBasePtr) {
      llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);

      if (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING) {
        OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffset);
      } else {
        // Treat x86 output buffer as byte[], get indexed pointer with explicit
        // byte offset computed using a datalayout based on
        // X86_CUSTOM_DL_STRING, then bitcast it to actual output type.
        uint64_t OutStep = DL.getTypeAllocSize(OutTy->getPointerElementType());
        llvm::Value *OutOffsetInBytes = Builder.CreateMul(OutOffset, llvm::ConstantInt::get(Int32Ty, OutStep));
        OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffsetInBytes);
        OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
      }

      if (PassOutByPointer) {
        RootArgs.push_back(OutPtr);
      }
    }

    // Inputs

    if (NumInPtrArguments > 0) {
      ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInPtrArguments,
                       InTypes, InBufPtrs, InStructTempSlots, IV, RootArgs);
    }

    finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);

    llvm::Value *RetVal = Builder.CreateCall(Function, RootArgs);

    // When the kernel returns its result by value, store it into the output
    // allocation ourselves.
    if (OutPtr && !PassOutByPointer) {
      RetVal->setName("call.result");
      llvm::StoreInst *Store = Builder.CreateStore(RetVal, OutPtr);
      if (gEnableRsTbaa) {
        Store->setMetadata("tbaa", TBAAAllocation);
      }
    }

    return true;
  }

  // Expand a simple reduce-style kernel function.
  //
  // The input is a kernel which represents a binary operation,
  // of the form
  //
  //   define foo @func(foo %a, foo %b),
  //
  // (More generally, it can be of the forms
  //
  //   define void @func(foo* %ret, foo* %a, foo* %b)
  //   define void @func(foo* %ret, foo1 %a, foo1 %b)
  //   define foo1 @func(foo2 %a, foo2 %b)
  //
  // as a result of argument / return value conversions. Here, "foo1"
  // and "foo2" refer to possibly coerced types, and the coerced
  // argument type may be different from the coerced return type. See
  // "Note on coercion" below.)
  //
  // Note also, we do not expect to encounter any case when the
  // arguments are promoted to pointers but the return value is
  // unpromoted to pointer, e.g.
1139 // 1140 // define foo1 @func(foo* %a, foo* %b) 1141 // 1142 // and we will throw an assertion in this case.) 1143 // 1144 // The input kernel gets expanded into a kernel of the form 1145 // 1146 // define void @func.expand(i8* %inBuf, i8* outBuf, i32 len) 1147 // 1148 // which performs a serial reduction of `len` elements from `inBuf`, 1149 // and stores the result into `outBuf`. In pseudocode, @func.expand 1150 // does: 1151 // 1152 // inArr := (foo *)inBuf; 1153 // accum := inArr[0]; 1154 // for (i := 1; i < len; ++i) { 1155 // accum := foo(accum, inArr[i]); 1156 // } 1157 // *(foo *)outBuf := accum; 1158 // 1159 // Note on coercion 1160 // 1161 // Both the return value and the argument types may undergo internal 1162 // coercion in clang as part of call lowering. As a result, the 1163 // return value type may differ from the argument type even if the 1164 // types in the RenderScript signaure are the same. For instance, the 1165 // kernel 1166 // 1167 // int3 add(int3 a, int3 b) { return a + b; } 1168 // 1169 // gets lowered by clang as 1170 // 1171 // define <3 x i32> @add(<4 x i32> %a.coerce, <4 x i32> %b.coerce) 1172 // 1173 // under AArch64. The details of this process are found in clang, 1174 // lib/CodeGen/TargetInfo.cpp, under classifyArgumentType() and 1175 // classifyReturnType() in ARMABIInfo, AArch64ABIInfo. If the value 1176 // is passed by pointer, then the pointed-to type is not coerced. 1177 // 1178 // Since we lack the original type information, this code does loads 1179 // and stores of allocation data by way of pointers to the coerced 1180 // type. 
  bool ExpandReduce(llvm::Function *Function) {
    bccAssert(Function);

    ALOGV("Expanding simple reduce kernel %s", Function->getName().str().c_str());

    llvm::DataLayout DL(Module);
    if (Module->getTargetTriple() == DEFAULT_X86_TRIPLE_STRING) {
      // Keep offsets consistent with slang-reflected code on x86.
      DL.reset(X86_CUSTOM_DL_STRING);
    }

    // TBAA Metadata
    llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript, *TBAAAllocation;
    llvm::MDBuilder MDHelper(*Context);

    TBAARenderScriptDistinct =
      MDHelper.createTBAARoot(kRenderScriptTBAARootName);
    TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
                                               TBAARenderScriptDistinct);
    TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
                                                       TBAARenderScript);
    TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
                                                      TBAAAllocation, 0);

    llvm::Function *ExpandedFunction =
      createEmptyExpandedReduceKernel(Function->getName());

    // Extract the expanded kernel's parameters.  It is guaranteed by
    // createEmptyExpandedReduceKernel that there will be 3 parameters.
    auto ExpandedFunctionArgIter = ExpandedFunction->arg_begin();

    llvm::Value *Arg_inBuf = &*(ExpandedFunctionArgIter++);
    llvm::Value *Arg_outBuf = &*(ExpandedFunctionArgIter++);
    llvm::Value *Arg_len = &*(ExpandedFunctionArgIter++);

    bccAssert(Function->arg_size() == 2 || Function->arg_size() == 3);

    // Check if, instead of returning a value, the original kernel has
    // a pointer parameter which points to a temporary buffer into
    // which the return value gets written.
    const bool ReturnValuePointerStyle = (Function->arg_size() == 3);
    bccAssert(Function->getReturnType()->isVoidTy() == ReturnValuePointerStyle);

    // Check if, instead of being passed by value, the inputs to the
    // original kernel are passed by pointer.
    auto FirstArgIter = Function->arg_begin();
    // The second argument is always an input to the original kernel.
    auto SecondArgIter = std::next(FirstArgIter);
    const bool InputsPointerStyle = SecondArgIter->getType()->isPointerTy();

    // Get the output type (i.e. return type of the original kernel).
    llvm::PointerType *OutPtrTy = nullptr;
    llvm::Type *OutTy = nullptr;
    if (ReturnValuePointerStyle) {
      OutPtrTy = llvm::dyn_cast<llvm::PointerType>(FirstArgIter->getType());
      bccAssert(OutPtrTy && "Expected a pointer parameter to kernel");
      OutTy = OutPtrTy->getElementType();
    } else {
      OutTy = Function->getReturnType();
      bccAssert(!OutTy->isVoidTy());
      OutPtrTy = OutTy->getPointerTo();
    }

    // Get the input type (type of the arguments to the original
    // kernel). Some input types are different from the output type,
    // due to explicit coercion that the compiler performs when
    // lowering the parameters. See "Note on coercion" above.
    llvm::PointerType *InPtrTy;
    llvm::Type *InTy;
    if (InputsPointerStyle) {
      InPtrTy = llvm::dyn_cast<llvm::PointerType>(SecondArgIter->getType());
      bccAssert(InPtrTy && "Expected a pointer parameter to kernel");
      bccAssert(ReturnValuePointerStyle);
      bccAssert(std::next(SecondArgIter)->getType() == InPtrTy &&
                "Input type mismatch");
      InTy = InPtrTy->getElementType();
    } else {
      InTy = SecondArgIter->getType();
      InPtrTy = InTy->getPointerTo();
      if (!ReturnValuePointerStyle) {
        bccAssert(InTy == FirstArgIter->getType() && "Input type mismatch");
      } else {
        bccAssert(InTy == std::next(SecondArgIter)->getType() &&
                  "Input type mismatch");
      }
    }

    // The input type should take up the same amount of space in
    // memory as the output type.
    bccAssert(DL.getTypeAllocSize(InTy) == DL.getTypeAllocSize(OutTy));

    // Construct the actual function body.
    llvm::IRBuilder<> Builder(&*ExpandedFunction->getEntryBlock().begin());

    // Cast input and output buffers to appropriate types.
    llvm::Value *InBuf = Builder.CreatePointerCast(Arg_inBuf, InPtrTy);
    llvm::Value *OutBuf = Builder.CreatePointerCast(Arg_outBuf, OutPtrTy);

    // Create a slot to pass temporary results back. This needs to be
    // separate from the accumulator slot because the kernel may mark
    // the return value slot as noalias.
    llvm::Value *ReturnBuf = nullptr;
    if (ReturnValuePointerStyle) {
      ReturnBuf = Builder.CreateAlloca(OutTy, nullptr, "ret.tmp");
    }

    // Create a slot to hold the second input if the inputs are passed
    // by pointer to the original kernel. We cannot directly pass a
    // pointer to the input buffer, because the kernel may modify its
    // inputs.
    llvm::Value *SecondInputTempBuf = nullptr;
    if (InputsPointerStyle) {
      SecondInputTempBuf = Builder.CreateAlloca(InTy, nullptr, "in.tmp");
    }

    // Create a slot to accumulate temporary results, and fill it with
    // the first value.
    llvm::Value *AccumBuf = Builder.CreateAlloca(OutTy, nullptr, "accum");
    // Cast to OutPtrTy before loading, since AccumBuf has type OutPtrTy.
    llvm::LoadInst *FirstElementLoad = Builder.CreateLoad(
      Builder.CreatePointerCast(InBuf, OutPtrTy));
    if (gEnableRsTbaa) {
      FirstElementLoad->setMetadata("tbaa", TBAAAllocation);
    }
    // Memory operations with AccumBuf shouldn't be marked with
    // RenderScript TBAA, since this might conflict with TBAA metadata
    // in the kernel function when AccumBuf is passed by pointer.
    Builder.CreateStore(FirstElementLoad, AccumBuf);

    // Loop body

    // Create the loop structure. Note that the first input in the input buffer
    // has already been accumulated, so that we start at index 1.
    llvm::Value *IndVar;
    llvm::Value *Start = llvm::ConstantInt::get(Arg_len->getType(), 1);
    llvm::BasicBlock *Exit = createLoop(Builder, Start, Arg_len, &IndVar);

    llvm::Value *InputPtr = Builder.CreateInBoundsGEP(InBuf, IndVar, "next_input.gep");

    // Set up arguments and call the original (unexpanded) kernel.
    //
    // The original kernel can have at most 3 arguments, which is
    // achieved when the signature looks like:
    //
    //    define void @func(foo* %ret, bar %a, bar %b)
    //
    // (bar can be one of foo/foo.coerce/foo*).
    llvm::SmallVector<llvm::Value *, 3> KernelArgs;

    if (ReturnValuePointerStyle) {
      KernelArgs.push_back(ReturnBuf);
    }

    if (InputsPointerStyle) {
      bccAssert(ReturnValuePointerStyle);
      // Because the return buffer is copied back into the
      // accumulator, it's okay if the accumulator is overwritten.
      KernelArgs.push_back(AccumBuf);

      llvm::LoadInst *InputLoad = Builder.CreateLoad(InputPtr);
      if (gEnableRsTbaa) {
        InputLoad->setMetadata("tbaa", TBAAAllocation);
      }
      Builder.CreateStore(InputLoad, SecondInputTempBuf);

      KernelArgs.push_back(SecondInputTempBuf);
    } else {
      // InPtrTy may be different from OutPtrTy (the type of
      // AccumBuf), so first cast the accumulator buffer to the
      // pointer type corresponding to the input argument type.
      KernelArgs.push_back(
        Builder.CreateLoad(Builder.CreatePointerCast(AccumBuf, InPtrTy)));

      llvm::LoadInst *LoadedArg = Builder.CreateLoad(InputPtr);
      if (gEnableRsTbaa) {
        LoadedArg->setMetadata("tbaa", TBAAAllocation);
      }
      KernelArgs.push_back(LoadedArg);
    }

    llvm::Value *RetVal = Builder.CreateCall(Function, KernelArgs);

    const uint64_t ElementSize = DL.getTypeStoreSize(OutTy);
    const uint64_t ElementAlign = DL.getABITypeAlignment(OutTy);

    // Store the output in the accumulator.
    if (ReturnValuePointerStyle) {
      Builder.CreateMemCpy(AccumBuf, ReturnBuf, ElementSize, ElementAlign);
    } else {
      Builder.CreateStore(RetVal, AccumBuf);
    }

    // Loop exit
    Builder.SetInsertPoint(Exit, Exit->begin());

    // Write the final accumulated value out to the caller's buffer.
    llvm::LoadInst *OutputLoad = Builder.CreateLoad(AccumBuf);
    llvm::StoreInst *OutputStore = Builder.CreateStore(OutputLoad, OutBuf);
    if (gEnableRsTbaa) {
      OutputStore->setMetadata("tbaa", TBAAAllocation);
    }

    return true;
  }

  // Certain categories of functions that make up a general
  // reduce-style kernel are called directly from the driver with no
  // expansion needed. For a function in such a category, we need to
  // promote linkage from static to external, to ensure that the
  // function is visible to the driver in the dynamic symbol table.
  // This promotion is safe because we don't have any kind of cross
  // translation unit linkage model (except for linking against
  // RenderScript libraries), so we do not risk name clashes.
  //
  // Returns true iff this call actually promoted Fn (i.e. Fn was not
  // already in PromotedFunctions).
  bool PromoteReduceNewFunction(const char *Name, FunctionSet &PromotedFunctions) {
    if (!Name) // a presumably-optional function that is not present
      return false;

    llvm::Function *Fn = Module->getFunction(Name);
    bccAssert(Fn != nullptr);
    if (PromotedFunctions.insert(Fn).second) {
      bccAssert(Fn->getLinkage() == llvm::GlobalValue::InternalLinkage);
      Fn->setLinkage(llvm::GlobalValue::ExternalLinkage);
      return true;
    }

    return false;
  }

  // Expand the accumulator function for a general reduce-style kernel.
  //
  // The input is a function of the form
  //
  //   define void @func(accumType* %accum, foo1 in1[, ... fooN inN] [, special arguments])
  //
  // where all arguments except the first are the same as for a foreach kernel.
  //
  // The input accumulator function gets expanded into a function of the form
  //
  //   define void @func.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, accumType* %accum)
  //
  // which performs a serial accumulaion of elements [x1, x2) into *%accum.
  //
  // In pseudocode, @func.expand does:
  //
  //   for (i = %x1; i < %x2; ++i) {
  //     func(%accum,
  //          *((foo1 *)p->inPtr[0] + i)[, ... *((fooN *)p->inPtr[N-1] + i)
  //          [, p] [, i] [, p->current.y] [, p->current.z]);
  //   }
  //
  // This is very similar to foreach kernel expansion with no output.
  bool ExpandReduceNewAccumulator(llvm::Function *FnAccumulator, uint32_t Signature, size_t NumInputs) {
    ALOGV("Expanding accumulator %s for general reduce kernel",
          FnAccumulator->getName().str().c_str());

    // Create TBAA meta-data.
    llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript,
                 *TBAAAllocation, *TBAAPointer;
    llvm::MDBuilder MDHelper(*Context);
    TBAARenderScriptDistinct =
      MDHelper.createTBAARoot(kRenderScriptTBAARootName);
    TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
                                               TBAARenderScriptDistinct);
    TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
                                                       TBAARenderScript);
    TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
                                                      TBAAAllocation, 0);
    TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer",
                                                    TBAARenderScript);
    TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);

    auto AccumulatorArgIter = FnAccumulator->arg_begin();

    // Create empty accumulator function.  The first accumulator argument
    // (the accumType* slot) determines the expanded function's last
    // parameter type; AccumulatorArgIter is left pointing at the inputs.
    llvm::Function *FnExpandedAccumulator =
        createEmptyExpandedReduceNewAccumulator(FnAccumulator->getName(),
                                                (AccumulatorArgIter++)->getType());

    // Extract the expanded accumulator's parameters.  It is
    // guaranteed by createEmptyExpandedReduceNewAccumulator that
    // there will be 4 parameters.
    bccAssert(FnExpandedAccumulator->arg_size() == kNumExpandedReduceNewAccumulatorParams);
    auto ExpandedAccumulatorArgIter = FnExpandedAccumulator->arg_begin();
    llvm::Value *Arg_p = &*(ExpandedAccumulatorArgIter++);
    llvm::Value *Arg_x1 = &*(ExpandedAccumulatorArgIter++);
    llvm::Value *Arg_x2 = &*(ExpandedAccumulatorArgIter++);
    llvm::Value *Arg_accum = &*(ExpandedAccumulatorArgIter++);

    // Construct the actual function body.
    llvm::IRBuilder<> Builder(&*FnExpandedAccumulator->getEntryBlock().begin());

    // Create the loop structure.
    llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
    llvm::Value *IndVar;
    createLoop(Builder, Arg_x1, Arg_x2, &IndVar);

    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
    const int CalleeArgsContextIdx =
        ExpandSpecialArguments(Signature, IndVar, Arg_p, Builder, CalleeArgs,
                               [](){}, LoopHeader->getTerminator());

    llvm::SmallVector<llvm::Type*, 8> InTypes;
    llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
    llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
    ExpandInputsLoopInvariant(Builder, LoopHeader, Arg_p, TBAAPointer, AccumulatorArgIter, NumInputs,
                              InTypes, InBufPtrs, InStructTempSlots);

    // Populate the actual call to the original accumulator.  The
    // accumulator slot comes first, then the per-iteration inputs, then
    // any special arguments.
    llvm::SmallVector<llvm::Value*, 8> RootArgs;
    RootArgs.push_back(Arg_accum);
    ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInputs, InTypes, InBufPtrs, InStructTempSlots,
                     IndVar, RootArgs);
    finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *FnAccumulator, Builder);
    Builder.CreateCall(FnAccumulator, RootArgs);

    return true;
  }

  // Create a combiner function for a general reduce-style kernel that lacks one,
  // by calling the accumulator function.
  //
  // The accumulator function must be of the form
  //
  //   define void @accumFn(accumType* %accum, accumType %in)
  //
  // A combiner function will be generated of the form
  //
  //   define void @accumFn.combiner(accumType* %accum, accumType* %other) {
  //     %1 = load accumType, accumType* %other
  //     call void @accumFn(accumType* %accum, accumType %1);
  //   }
  bool CreateReduceNewCombinerFromAccumulator(llvm::Function *FnAccumulator) {
    ALOGV("Creating combiner from accumulator %s for general reduce kernel",
          FnAccumulator->getName().str().c_str());

    using llvm::Attribute;

    bccAssert(FnAccumulator->arg_size() == 2);
    auto AccumulatorArgIter = FnAccumulator->arg_begin();
    llvm::Value *AccumulatorArg_accum = &*(AccumulatorArgIter++);
    llvm::Value *AccumulatorArg_in = &*(AccumulatorArgIter++);
    llvm::Type *AccumulatorArgType = AccumulatorArg_accum->getType();
    bccAssert(AccumulatorArgType->isPointerTy());

    // The combiner takes two pointers of the accumulator-slot type:
    // (accum, other).
    llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
    llvm::FunctionType *CombinerType =
        llvm::FunctionType::get(VoidTy, { AccumulatorArgType, AccumulatorArgType }, false);
    llvm::Function *FnCombiner =
        llvm::Function::Create(CombinerType, llvm::GlobalValue::ExternalLinkage,
                               nameReduceNewCombinerFromAccumulator(FnAccumulator->getName()),
                               Module);

    auto CombinerArgIter = FnCombiner->arg_begin();

    // Note: parameter attribute indices are 1-based, hence getArgNo() + 1.
    llvm::Argument *CombinerArg_accum = &(*CombinerArgIter++);
    CombinerArg_accum->setName("accum");
    CombinerArg_accum->addAttr(llvm::AttributeSet::get(*Context, CombinerArg_accum->getArgNo() + 1,
                                                       llvm::makeArrayRef(Attribute::NoCapture)));

    llvm::Argument *CombinerArg_other = &(*CombinerArgIter++);
    CombinerArg_other->setName("other");
    CombinerArg_other->addAttr(llvm::AttributeSet::get(*Context, CombinerArg_other->getArgNo() + 1,
                                                       llvm::makeArrayRef(Attribute::NoCapture)));

    llvm::BasicBlock *BB = llvm::BasicBlock::Create(*Context, "BB", FnCombiner);
    llvm::IRBuilder<> Builder(BB);

    if (AccumulatorArg_in->getType()->isPointerTy()) {
      // Types of sufficient size get passed by pointer-to-copy rather
      // than passed by value.  An accumulator cannot take a pointer
      // at the user level; so if we see a pointer here, we know that
      // we have a pass-by-pointer-to-copy case.
      llvm::Type *ElementType = AccumulatorArg_in->getType()->getPointerElementType();
      llvm::Value *TempMem = Builder.CreateAlloca(ElementType, nullptr, "caller_copy");
      Builder.CreateStore(Builder.CreateLoad(CombinerArg_other), TempMem);
      Builder.CreateCall(FnAccumulator, { CombinerArg_accum, TempMem });
    } else {
      llvm::Value *TypeAdjustedOther = CombinerArg_other;
      if (AccumulatorArgType->getPointerElementType() != AccumulatorArg_in->getType()) {
        // Call lowering by frontend has done some type coercion
        TypeAdjustedOther = Builder.CreatePointerCast(CombinerArg_other,
                                                      AccumulatorArg_in->getType()->getPointerTo(),
                                                      "cast");
      }
      llvm::Value *DerefOther = Builder.CreateLoad(TypeAdjustedOther);
      Builder.CreateCall(FnAccumulator, { CombinerArg_accum, DerefOther });
    }
    Builder.CreateRetVoid();

    return true;
  }

  /// @brief Checks if pointers to allocation internals are exposed
  ///
  /// This function verifies if through the parameters passed to the kernel
  /// or through calls to the runtime library the script gains access to
  /// pointers pointing to data within a RenderScript Allocation.
  /// If we know we control all loads from and stores to data within
  /// RenderScript allocations and if we know the run-time internal accesses
  /// are all annotated with RenderScript TBAA metadata, only then we
  /// can safely use TBAA to distinguish between generic and from-allocation
  /// pointers.
1581 bool allocPointersExposed(llvm::Module &Module) { 1582 // Old style kernel function can expose pointers to elements within 1583 // allocations. 1584 // TODO: Extend analysis to allow simple cases of old-style kernels. 1585 for (size_t i = 0; i < mExportForEachCount; ++i) { 1586 const char *Name = mExportForEachNameList[i]; 1587 uint32_t Signature = mExportForEachSignatureList[i]; 1588 if (Module.getFunction(Name) && 1589 !bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature)) { 1590 return true; 1591 } 1592 } 1593 1594 // Check for library functions that expose a pointer to an Allocation or 1595 // that are not yet annotated with RenderScript-specific tbaa information. 1596 static const std::vector<const char *> Funcs{ 1597 // rsGetElementAt(...) 1598 "_Z14rsGetElementAt13rs_allocationj", 1599 "_Z14rsGetElementAt13rs_allocationjj", 1600 "_Z14rsGetElementAt13rs_allocationjjj", 1601 1602 // rsSetElementAt() 1603 "_Z14rsSetElementAt13rs_allocationPvj", 1604 "_Z14rsSetElementAt13rs_allocationPvjj", 1605 "_Z14rsSetElementAt13rs_allocationPvjjj", 1606 1607 // rsGetElementAtYuv_uchar_Y() 1608 "_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj", 1609 1610 // rsGetElementAtYuv_uchar_U() 1611 "_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj", 1612 1613 // rsGetElementAtYuv_uchar_V() 1614 "_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj", 1615 }; 1616 1617 for (auto FI : Funcs) { 1618 llvm::Function *Function = Module.getFunction(FI); 1619 1620 if (!Function) { 1621 ALOGE("Missing run-time function '%s'", FI); 1622 return true; 1623 } 1624 1625 if (Function->getNumUses() > 0) { 1626 return true; 1627 } 1628 } 1629 1630 return false; 1631 } 1632 1633 /// @brief Connect RenderScript TBAA metadata to C/C++ metadata 1634 /// 1635 /// The TBAA metadata used to annotate loads/stores from RenderScript 1636 /// Allocations is generated in a separate TBAA tree with a 1637 /// "RenderScript Distinct TBAA" root node. 
  /// LLVM does assume may-alias for
  /// all nodes in unrelated alias analysis trees. This function makes the
  /// "RenderScript TBAA" node (which is parented by the Distinct TBAA root),
  /// a subtree of the normal C/C++ TBAA tree aside of normal C/C++ types. With
  /// the connected trees every access to an Allocation is resolved to
  /// must-alias if compared to a normal C/C++ access.
  void connectRenderScriptTBAAMetadata(llvm::Module &Module) {
    llvm::MDBuilder MDHelper(*Context);
    llvm::MDNode *TBAARenderScriptDistinct =
      MDHelper.createTBAARoot("RenderScript Distinct TBAA");
    llvm::MDNode *TBAARenderScript = MDHelper.createTBAANode(
        "RenderScript TBAA", TBAARenderScriptDistinct);
    llvm::MDNode *TBAARoot = MDHelper.createTBAARoot("Simple C/C++ TBAA");
    // Operand 1 of a TBAA node is its parent.  Re-parenting the
    // "RenderScript TBAA" node under the C/C++ root joins the two trees.
    TBAARenderScript->replaceOperandWith(1, TBAARoot);
  }

  /// @brief Pass entry point: expand all RenderScript kernels in the module
  ///
  /// Reads the export metadata, expands foreach-style kernels (new- and
  /// old-style), simple reduce kernels, and general reduce kernels, and
  /// finally connects the RenderScript TBAA tree to the C/C++ TBAA tree
  /// when no allocation-internal pointers are exposed.
  ///
  /// @param Module the module being transformed (note: shadows the member
  ///               pointer, which is set from it below)
  /// @return true if the module was changed
  virtual bool runOnModule(llvm::Module &Module) {
    bool Changed = false;
    this->Module = &Module;
    Context = &Module.getContext();

    buildTypes();

    bcinfo::MetadataExtractor me(&Module);
    if (!me.extract()) {
      ALOGE("Could not extract metadata from module!");
      return false;
    }

    // Expand forEach_* style kernels.
    // These lists are kept in members because allocPointersExposed()
    // re-reads them later in this function.
    mExportForEachCount = me.getExportForEachSignatureCount();
    mExportForEachNameList = me.getExportForEachNameList();
    mExportForEachSignatureList = me.getExportForEachSignatureList();

    for (size_t i = 0; i < mExportForEachCount; ++i) {
      const char *name = mExportForEachNameList[i];
      uint32_t signature = mExportForEachSignatureList[i];
      llvm::Function *kernel = Module.getFunction(name);
      if (kernel) {
        if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) {
          Changed |= ExpandForEach(kernel, signature);
          kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
        } else if (kernel->getReturnType()->isVoidTy()) {
          // Old-style kernels return void and take pointer parameters.
          Changed |= ExpandOldStyleForEach(kernel, signature);
          kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
        } else {
          // There are some graphics root functions that are not
          // expanded, but that will be called directly. For those
          // functions, we can not set the linkage to internal.
        }
      }
    }

    // Expand simple reduce_* style kernels.
    mExportReduceCount = me.getExportReduceCount();
    mExportReduceNameList = me.getExportReduceNameList();

    for (size_t i = 0; i < mExportReduceCount; ++i) {
      llvm::Function *kernel = Module.getFunction(mExportReduceNameList[i]);
      if (kernel) {
        Changed |= ExpandReduce(kernel);
      }
    }

    // Process general reduce_* style functions.
    const size_t ExportReduceNewCount = me.getExportReduceNewCount();
    const bcinfo::MetadataExtractor::ReduceNew *ExportReduceNewList = me.getExportReduceNewList();
    // Note that functions can be shared between kernels; the sets below
    // ensure each shared function is processed only once.
    FunctionSet PromotedFunctions, ExpandedAccumulators, AccumulatorsForCombiners;

    for (size_t i = 0; i < ExportReduceNewCount; ++i) {
      Changed |= PromoteReduceNewFunction(ExportReduceNewList[i].mInitializerName, PromotedFunctions);
      Changed |= PromoteReduceNewFunction(ExportReduceNewList[i].mCombinerName, PromotedFunctions);
      Changed |= PromoteReduceNewFunction(ExportReduceNewList[i].mOutConverterName, PromotedFunctions);

      // Accumulator
      llvm::Function *accumulator = Module.getFunction(ExportReduceNewList[i].mAccumulatorName);
      bccAssert(accumulator != nullptr);
      if (ExpandedAccumulators.insert(accumulator).second)
        Changed |= ExpandReduceNewAccumulator(accumulator,
                                              ExportReduceNewList[i].mSignature,
                                              ExportReduceNewList[i].mInputCount);
      // A kernel without an explicit combiner gets one synthesized from
      // its accumulator.
      if (!ExportReduceNewList[i].mCombinerName) {
        if (AccumulatorsForCombiners.insert(accumulator).second)
          Changed |= CreateReduceNewCombinerFromAccumulator(accumulator);
      }
    }

    // Only connect the TBAA trees when no pointer into allocation
    // internals can escape; otherwise TBAA could mis-disambiguate.
    if (gEnableRsTbaa && !allocPointersExposed(Module)) {
      connectRenderScriptTBAAMetadata(Module);
    }

    return Changed;
  }

  virtual const char *getPassName() const {
    return "forEach_* and reduce_* function expansion";
  }

}; // end RSKernelExpandPass

} // end anonymous namespace

char RSKernelExpandPass::ID = 0;
// Registers the pass for opt-style invocation as "-kernelexp".
static llvm::RegisterPass<RSKernelExpandPass> X("kernelexp", "Kernel Expand Pass");

namespace bcc {

// NOTE(review): presumably the name given to the expanded kernels' loop
// index variable — confirm against the expansion code earlier in this file.
const char BCC_INDEX_VAR_NAME[] = "rsIndex";

// Factory used by the compiler driver to create this pass.
llvm::ModulePass *
createRSKernelExpandPass(bool pEnableStepOpt) {
  return new RSKernelExpandPass(pEnableStepOpt);
}

} // end namespace bcc