RSKernelExpand.cpp revision dfde70a8ae9b77bbf0e8d9d22a55e1d1fda7d64d
1/*
2 * Copyright 2012, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "bcc/Assert.h"
18#include "bcc/Renderscript/RSTransforms.h"
19
20#include <cstdlib>
21#include <functional>
22
23#include <llvm/IR/DerivedTypes.h>
24#include <llvm/IR/Function.h>
25#include <llvm/IR/Instructions.h>
26#include <llvm/IR/IRBuilder.h>
27#include <llvm/IR/MDBuilder.h>
28#include <llvm/IR/Module.h>
29#include <llvm/Pass.h>
30#include <llvm/Support/raw_ostream.h>
31#include <llvm/IR/DataLayout.h>
32#include <llvm/IR/Function.h>
33#include <llvm/IR/Type.h>
34#include <llvm/Transforms/Utils/BasicBlockUtils.h>
35
36#include "bcc/Config/Config.h"
37#include "bcc/Support/Log.h"
38
39#include "bcinfo/MetadataExtractor.h"
40
#ifndef __DISABLE_ASSERTS
// Only used in bccAssert()
// Expected argument counts of the generated "<NAME>.expand" functions; the
// kernel-skeleton builders assert against these right after creating the
// function.
const int kNumExpandedForeachParams = 4;
const int kNumExpandedReduceParams = 3;
#endif

// Names handed to llvm::MDBuilder when the TBAA root and the node below it
// are created for expanded kernels.
const char kRenderScriptTBAARootName[] = "RenderScript Distinct TBAA";
const char kRenderScriptTBAANodeName[] = "RenderScript TBAA";
49
50using namespace bcc;
51
52namespace {
53
// Compile-time switch for RenderScript TBAA annotations. It is not referenced
// in the part of the file reviewed here; presumably it is consulted where the
// TBAA metadata is attached to loads and stores (TODO confirm).
static const bool gEnableRsTbaa = true;
55
56/* RSKernelExpandPass - This pass operates on functions that are able
57 * to be called via rsForEach(), "foreach_<NAME>", or
58 * "reduce_<NAME>". We create an inner loop for the function to be
59 * invoked over the appropriate data cells of the input/output
60 * allocations (adjusting other relevant parameters as we go). We
61 * support doing this for any forEach or reduce style compute
62 * kernels. The new function name is the original function name
63 * followed by ".expand". Note that we still generate code for the
64 * original function.
65 */
66class RSKernelExpandPass : public llvm::ModulePass {
67public:
68  static char ID;
69
70private:
  // Length of the fixed-size pointer and stride arrays that buildTypes() puts
  // into the RsExpandKernelDriverInfoPfx struct type.
  static const size_t RS_KERNEL_INPUT_LIMIT = 8; // see frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h
72
  // Field indices of the RsLaunchDimensions struct type. The enumerator order
  // mirrors the order in which buildTypes() adds the struct's members, so the
  // values can be used directly as GEP indices (see ExpandSpecialArguments()).
  enum RsLaunchDimensionsField {
    RsLaunchDimensionsFieldX,
    RsLaunchDimensionsFieldY,
    RsLaunchDimensionsFieldZ,
    RsLaunchDimensionsFieldLod,
    RsLaunchDimensionsFieldFace,
    RsLaunchDimensionsFieldArray,

    // Number of fields; also sizes the type list in buildTypes().
    RsLaunchDimensionsFieldCount
  };
83
  // Field indices of the RsExpandKernelDriverInfoPfx struct type. The
  // enumerator order mirrors the order in which buildTypes() adds the struct's
  // members, so the values can be used directly as GEP indices.
  enum RsExpandKernelDriverInfoPfxField {
    RsExpandKernelDriverInfoPfxFieldInPtr,
    RsExpandKernelDriverInfoPfxFieldInStride,
    RsExpandKernelDriverInfoPfxFieldInLen,
    RsExpandKernelDriverInfoPfxFieldOutPtr,
    RsExpandKernelDriverInfoPfxFieldOutStride,
    RsExpandKernelDriverInfoPfxFieldOutLen,
    RsExpandKernelDriverInfoPfxFieldDim,
    RsExpandKernelDriverInfoPfxFieldCurrent,
    RsExpandKernelDriverInfoPfxFieldUsr,
    // Index of the "usrLen" member.
    // NOTE(review): the identifier looks like a typo for "...FieldUsrLen"; it
    // is left as is because uses outside the reviewed section cannot be ruled
    // out.
    RsExpandKernelDriverInfoPfxFieldUsLenr,

    // Number of fields; also sizes the type list in buildTypes().
    RsExpandKernelDriverInfoPfxFieldCount
  };
98
  // Module being transformed and its LLVMContext. Both are null after
  // construction; presumably they are assigned when the pass is run (that code
  // is not part of the reviewed section - TODO confirm).
  llvm::Module *Module;
  llvm::LLVMContext *Context;

  /*
   * Pointers to LLVM type information for the function signatures
   * for expanded functions. These must be re-calculated for each module
   * the pass is run on.
   */
  llvm::FunctionType *ExpandedForEachType, *ExpandedReduceType;

  // Exported ForEach kernels: count, names and signature bitmasks.
  // NOTE(review): not written anywhere in the reviewed section; presumably
  // filled in from the module's metadata before expansion - confirm.
  uint32_t mExportForEachCount;
  const char **mExportForEachNameList;
  const uint32_t *mExportForEachSignatureList;

  // Exported reduce kernels: count and names (same caveat as above).
  uint32_t mExportReduceCount;
  const char **mExportReduceNameList;

  // Turns on optimization of allocation stride values.
  bool mEnableStepOpt;
118
119  uint32_t getRootSignature(llvm::Function *Function) {
120    const llvm::NamedMDNode *ExportForEachMetadata =
121        Module->getNamedMetadata("#rs_export_foreach");
122
123    if (!ExportForEachMetadata) {
124      llvm::SmallVector<llvm::Type*, 8> RootArgTys;
125      for (llvm::Function::arg_iterator B = Function->arg_begin(),
126                                        E = Function->arg_end();
127           B != E;
128           ++B) {
129        RootArgTys.push_back(B->getType());
130      }
131
132      // For pre-ICS bitcode, we may not have signature information. In that
133      // case, we use the size of the RootArgTys to select the number of
134      // arguments.
135      return (1 << RootArgTys.size()) - 1;
136    }
137
138    if (ExportForEachMetadata->getNumOperands() == 0) {
139      return 0;
140    }
141
142    bccAssert(ExportForEachMetadata->getNumOperands() > 0);
143
144    // We only handle the case for legacy root() functions here, so this is
145    // hard-coded to look at only the first such function.
146    llvm::MDNode *SigNode = ExportForEachMetadata->getOperand(0);
147    if (SigNode != nullptr && SigNode->getNumOperands() == 1) {
148      llvm::Metadata *SigMD = SigNode->getOperand(0);
149      if (llvm::MDString *SigS = llvm::dyn_cast<llvm::MDString>(SigMD)) {
150        llvm::StringRef SigString = SigS->getString();
151        uint32_t Signature = 0;
152        if (SigString.getAsInteger(10, Signature)) {
153          ALOGE("Non-integer signature value '%s'", SigString.str().c_str());
154          return 0;
155        }
156        return Signature;
157      }
158    }
159
160    return 0;
161  }
162
  // Decide whether the stride of an allocation addressed through AllocType
  // may be replaced by a compile-time constant (see getStepValue()).
  //
  // AllocType - type of the kernel parameter that addresses the allocation.
  //
  // Returns true only if none of the checks below rejects the type.
  //
  // NOTE(review): several checks here look suspicious and should be confirmed
  // before this function is relied upon:
  //  * The first check returns false when mEnableStepOpt is TRUE, which reads
  //    as inverted relative to the member's name and comment. With the
  //    constructor's default (true) the function therefore always returns
  //    false and none of the later checks run.
  //  * VoidPtrTy is a pointer type; llvm::Type::getPrimitiveSizeInBits() is
  //    documented to return 0 for types without a primitive size, so the
  //    "== 32" comparison may never hold - confirm.
  //  * getStructName() is a struct-type accessor, but AllocType is known to be
  //    a pointer type by the time it is called - confirm the call is valid.
  //  * AllocType is a pointer type at that point, so the comparisons against
  //    the <2 x i64> and i128 types can only be false.
  //  Flipping the first check in isolation would expose the later ones, so the
  //  code is intentionally left unchanged here.
  bool isStepOptSupported(llvm::Type *AllocType) {

    llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
    llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context);

    if (mEnableStepOpt) {
      return false;
    }

    // An i8* gives no information about the element size.
    if (AllocType == VoidPtrTy) {
      return false;
    }

    // Only pointer types have an element type to take the size of.
    if (!PT) {
      return false;
    }

    // remaining conditions are 64-bit only
    if (VoidPtrTy->getPrimitiveSizeInBits() == 32) {
      return true;
    }

    // coerce suggests an upconverted struct type, which we can't support
    if (AllocType->getStructName().find("coerce") != llvm::StringRef::npos) {
      return false;
    }

    // 2xi64 and i128 suggest an upconverted struct type, which are also unsupported
    llvm::Type *V2xi64Ty = llvm::VectorType::get(llvm::Type::getInt64Ty(*Context), 2);
    llvm::Type *Int128Ty = llvm::Type::getIntNTy(*Context, 128);
    if (AllocType == V2xi64Ty || AllocType == Int128Ty) {
      return false;
    }

    return true;
  }
199
200  // Get the actual value we should use to step through an allocation.
201  //
202  // Normally the value we use to step through an allocation is given to us by
203  // the driver. However, for certain primitive data types, we can derive an
204  // integer constant for the step value. We use this integer constant whenever
205  // possible to allow further compiler optimizations to take place.
206  //
207  // DL - Target Data size/layout information.
  // AllocType - Type of allocation (should be a pointer).
209  // OrigStep - Original step increment (root.expand() input from driver).
210  llvm::Value *getStepValue(llvm::DataLayout *DL, llvm::Type *AllocType,
211                            llvm::Value *OrigStep) {
212    bccAssert(DL);
213    bccAssert(AllocType);
214    bccAssert(OrigStep);
215    llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
216    if (isStepOptSupported(AllocType)) {
217      llvm::Type *ET = PT->getElementType();
218      uint64_t ETSize = DL->getTypeAllocSize(ET);
219      llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
220      return llvm::ConstantInt::get(Int32Ty, ETSize);
221    } else {
222      return OrigStep;
223    }
224  }
225
226  /// Builds the types required by the pass for the given context.
227  void buildTypes(void) {
228    // Create the RsLaunchDimensionsTy and RsExpandKernelDriverInfoPfxTy structs.
229
230    llvm::Type *Int8Ty                   = llvm::Type::getInt8Ty(*Context);
231    llvm::Type *Int8PtrTy                = Int8Ty->getPointerTo();
232    llvm::Type *Int8PtrArrayInputLimitTy = llvm::ArrayType::get(Int8PtrTy, RS_KERNEL_INPUT_LIMIT);
233    llvm::Type *Int32Ty                  = llvm::Type::getInt32Ty(*Context);
234    llvm::Type *Int32ArrayInputLimitTy   = llvm::ArrayType::get(Int32Ty, RS_KERNEL_INPUT_LIMIT);
235    llvm::Type *VoidPtrTy                = llvm::Type::getInt8PtrTy(*Context);
236    llvm::Type *Int32Array4Ty            = llvm::ArrayType::get(Int32Ty, 4);
237
238    /* Defined in frameworks/base/libs/rs/cpu_ref/rsCpuCore.h:
239     *
240     * struct RsLaunchDimensions {
241     *   uint32_t x;
242     *   uint32_t y;
243     *   uint32_t z;
244     *   uint32_t lod;
245     *   uint32_t face;
246     *   uint32_t array[4];
247     * };
248     */
249    llvm::SmallVector<llvm::Type*, RsLaunchDimensionsFieldCount> RsLaunchDimensionsTypes;
250    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t x
251    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t y
252    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t z
253    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t lod
254    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t face
255    RsLaunchDimensionsTypes.push_back(Int32Array4Ty); // uint32_t array[4]
256    llvm::StructType *RsLaunchDimensionsTy =
257        llvm::StructType::create(RsLaunchDimensionsTypes, "RsLaunchDimensions");
258
259    /* Defined as the beginning of RsExpandKernelDriverInfo in frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h:
260     *
261     * struct RsExpandKernelDriverInfoPfx {
262     *     const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT];
263     *     uint32_t inStride[RS_KERNEL_INPUT_LIMIT];
264     *     uint32_t inLen;
265     *
266     *     uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT];
267     *     uint32_t outStride[RS_KERNEL_INPUT_LIMIT];
268     *     uint32_t outLen;
269     *
270     *     // Dimension of the launch
271     *     RsLaunchDimensions dim;
272     *
273     *     // The walking iterator of the launch
274     *     RsLaunchDimensions current;
275     *
276     *     const void *usr;
277     *     uint32_t usrLen;
278     *
279     *     // Items below this line are not used by the compiler and can be change in the driver.
280     *     // So the compiler must assume there are an unknown number of fields of unknown type
281     *     // beginning here.
282     * };
283     *
284     * The name "RsExpandKernelDriverInfoPfx" is known to RSInvariantPass (RSInvariant.cpp).
285     */
286    llvm::SmallVector<llvm::Type*, RsExpandKernelDriverInfoPfxFieldCount> RsExpandKernelDriverInfoPfxTypes;
287    RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT]
288    RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy);   // uint32_t inStride[RS_KERNEL_INPUT_LIMIT]
289    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t inLen
290    RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT]
291    RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy);   // uint32_t outStride[RS_KERNEL_INPUT_LIMIT]
292    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t outLen
293    RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy);     // RsLaunchDimensions dim
294    RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy);     // RsLaunchDimensions current
295    RsExpandKernelDriverInfoPfxTypes.push_back(VoidPtrTy);                // const void *usr
296    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t usrLen
297    llvm::StructType *RsExpandKernelDriverInfoPfxTy =
298        llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx");
299
300    // Create the function type for expanded kernels.
301    llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
302
303    llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo();
304    // void (const RsExpandKernelDriverInfoPfxTy *p, uint32_t x1, uint32_t x2, uint32_t outstep)
305    ExpandedForEachType = llvm::FunctionType::get(VoidTy,
306        {RsExpandKernelDriverInfoPfxPtrTy, Int32Ty, Int32Ty, Int32Ty}, false);
307
308    // void (void *inBuf, void *outBuf, uint32_t len)
309    ExpandedReduceType = llvm::FunctionType::get(VoidTy, {VoidPtrTy, VoidPtrTy, Int32Ty}, false);
310  }
311
312  /// @brief Create skeleton of the expanded foreach kernel.
313  ///
314  /// This creates a function with the following signature:
315  ///
  ///   void (const RsExpandKernelDriverInfoPfx *p, uint32_t x1, uint32_t x2,
  ///         uint32_t outstep)
318  ///
319  llvm::Function *createEmptyExpandedForEachKernel(llvm::StringRef OldName) {
320    llvm::Function *ExpandedFunction =
321      llvm::Function::Create(ExpandedForEachType,
322                             llvm::GlobalValue::ExternalLinkage,
323                             OldName + ".expand", Module);
324    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
325    llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
326    (AI++)->setName("p");
327    (AI++)->setName("x1");
328    (AI++)->setName("x2");
329    (AI++)->setName("arg_outstep");
330    llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
331                                                       ExpandedFunction);
332    llvm::IRBuilder<> Builder(Begin);
333    Builder.CreateRetVoid();
334    return ExpandedFunction;
335  }
336
337  // Create skeleton of the expanded reduce kernel.
338  //
339  // This creates a function with the following signature:
340  //
341  //   void @func.expand(i8* nocapture %inBuf, i8* nocapture %outBuf, i32 len)
342  //
343  llvm::Function *createEmptyExpandedReduceKernel(llvm::StringRef OldName) {
344    llvm::Function *ExpandedFunction =
345      llvm::Function::Create(ExpandedReduceType,
346                             llvm::GlobalValue::ExternalLinkage,
347                             OldName + ".expand", Module);
348    bccAssert(ExpandedFunction->arg_size() == kNumExpandedReduceParams);
349
350    llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
351
352    using llvm::Attribute;
353
354    llvm::Argument *InBuf = &(*AI++);
355    InBuf->setName("inBuf");
356    InBuf->addAttr(llvm::AttributeSet::get(*Context, InBuf->getArgNo() + 1, llvm::makeArrayRef(Attribute::NoCapture)));
357
358    llvm::Argument *OutBuf = &(*AI++);
359    OutBuf->setName("outBuf");
360    OutBuf->addAttr(llvm::AttributeSet::get(*Context, OutBuf->getArgNo() + 1, llvm::makeArrayRef(Attribute::NoCapture)));
361
362    (AI++)->setName("len");
363
364    llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
365                                                       ExpandedFunction);
366    llvm::IRBuilder<> Builder(Begin);
367    Builder.CreateRetVoid();
368
369    return ExpandedFunction;
370  }
371
372  /// @brief Create an empty loop
373  ///
374  /// Create a loop of the form:
375  ///
376  /// for (i = LowerBound; i < UpperBound; i++)
377  ///   ;
378  ///
379  /// After the loop has been created, the builder is set such that
380  /// instructions can be added to the loop body.
381  ///
382  /// @param Builder The builder to use to build this loop. The current
383  ///                position of the builder is the position the loop
384  ///                will be inserted.
385  /// @param LowerBound The first value of the loop iterator
  /// @param UpperBound The exclusive upper bound of the loop iterator (the
  ///                   loop runs while i < UpperBound).
  /// @param LoopIV Output pointer; set to the PHI node holding the loop
  ///               iterator.
388  /// @return The BasicBlock that will be executed after the loop.
389  llvm::BasicBlock *createLoop(llvm::IRBuilder<> &Builder,
390                               llvm::Value *LowerBound,
391                               llvm::Value *UpperBound,
392                               llvm::PHINode **LoopIV) {
393    bccAssert(LowerBound->getType() == UpperBound->getType());
394
395    llvm::BasicBlock *CondBB, *AfterBB, *HeaderBB;
396    llvm::Value *Cond, *IVNext;
397    llvm::PHINode *IV;
398
399    CondBB = Builder.GetInsertBlock();
400    AfterBB = llvm::SplitBlock(CondBB, Builder.GetInsertPoint(), nullptr, nullptr);
401    HeaderBB = llvm::BasicBlock::Create(*Context, "Loop", CondBB->getParent());
402
403    // if (LowerBound < Upperbound)
404    //   goto LoopHeader
405    // else
406    //   goto AfterBB
407    CondBB->getTerminator()->eraseFromParent();
408    Builder.SetInsertPoint(CondBB);
409    Cond = Builder.CreateICmpULT(LowerBound, UpperBound);
410    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
411
412    // iv = PHI [CondBB -> LowerBound], [LoopHeader -> NextIV ]
413    // iv.next = iv + 1
414    // if (iv.next < Upperbound)
415    //   goto LoopHeader
416    // else
417    //   goto AfterBB
418    Builder.SetInsertPoint(HeaderBB);
419    IV = Builder.CreatePHI(LowerBound->getType(), 2, "X");
420    IV->addIncoming(LowerBound, CondBB);
421    IVNext = Builder.CreateNUWAdd(IV, Builder.getInt32(1));
422    IV->addIncoming(IVNext, HeaderBB);
423    Cond = Builder.CreateICmpULT(IVNext, UpperBound);
424    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
425    AfterBB->setName("Exit");
426    Builder.SetInsertPoint(HeaderBB->getFirstNonPHI());
427    *LoopIV = IV;
428    return AfterBB;
429  }
430
431  // Finish building the outgoing argument list for calling a ForEach-able function.
432  //
433  // ArgVector - on input, the non-special arguments
434  //             on output, the non-special arguments combined with the special arguments
435  //               from SpecialArgVector
436  // SpecialArgVector - special arguments (from ExpandSpecialArguments())
437  // SpecialArgContextIdx - return value of ExpandSpecialArguments()
438  //                          (position of context argument in SpecialArgVector)
439  // CalleeFunction - the ForEach-able function being called
  // CallerBuilder - for inserting code into the caller function
441  template<unsigned int ArgVectorLen, unsigned int SpecialArgVectorLen>
442  void finishArgList(      llvm::SmallVector<llvm::Value *, ArgVectorLen>        &ArgVector,
443                     const llvm::SmallVector<llvm::Value *, SpecialArgVectorLen> &SpecialArgVector,
444                     const int SpecialArgContextIdx,
445                     const llvm::Function &CalleeFunction,
446                     llvm::IRBuilder<> &CallerBuilder) {
447    /* The context argument (if any) is a pointer to an opaque user-visible type that differs from
448     * the RsExpandKernelDriverInfoPfx type used in the function we are generating (although the
449     * two types represent the same thing).  Therefore, we must introduce a pointer cast when
450     * generating a call to the kernel function.
451     */
452    const int ArgContextIdx =
453        SpecialArgContextIdx >= 0 ? (ArgVector.size() + SpecialArgContextIdx) : SpecialArgContextIdx;
454    ArgVector.append(SpecialArgVector.begin(), SpecialArgVector.end());
455    if (ArgContextIdx >= 0) {
456      llvm::Type *ContextArgType = nullptr;
457      int ArgIdx = ArgContextIdx;
458      for (const auto &Arg : CalleeFunction.getArgumentList()) {
459        if (!ArgIdx--) {
460          ContextArgType = Arg.getType();
461          break;
462        }
463      }
464      bccAssert(ContextArgType);
465      ArgVector[ArgContextIdx] = CallerBuilder.CreatePointerCast(ArgVector[ArgContextIdx], ContextArgType);
466    }
467  }
468
469  // GEPHelper() returns a SmallVector of values suitable for passing
470  // to IRBuilder::CreateGEP(), and SmallGEPIndices is a typedef for
471  // the returned data type. It is sized so that the SmallVector
472  // returned by GEPHelper() never needs to do a heap allocation for
473  // any list of GEP indices it encounters in the code.
474  typedef llvm::SmallVector<llvm::Value *, 3> SmallGEPIndices;
475
476  // Helper for turning a list of constant integer GEP indices into a
477  // SmallVector of llvm::Value*. The return value is suitable for
478  // passing to a GetElementPtrInst constructor or IRBuilder::CreateGEP().
479  //
480  // Inputs:
481  //   I32Args should be integers which represent the index arguments
482  //   to a GEP instruction.
483  //
484  // Returns:
485  //   Returns a SmallVector of ConstantInts.
486  SmallGEPIndices GEPHelper(const std::initializer_list<int32_t> I32Args) {
487    SmallGEPIndices Out(I32Args.size());
488    llvm::IntegerType *I32Ty = llvm::Type::getInt32Ty(*Context);
489    std::transform(I32Args.begin(), I32Args.end(), Out.begin(),
490                   [I32Ty](int32_t Arg) { return llvm::ConstantInt::get(I32Ty, Arg); });
491    return Out;
492  }
493
494public:
495  RSKernelExpandPass(bool pEnableStepOpt = true)
496      : ModulePass(ID), Module(nullptr), Context(nullptr),
497        mEnableStepOpt(pEnableStepOpt) {
498
499  }
500
  // Declares the analyses this pass requires or preserves. The body is
  // intentionally empty: nothing is requested and nothing is marked as
  // preserved.
  virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
    // This pass does not use any other analysis passes, but it does
    // add/wrap the existing functions in the module (thus altering the CFG).
  }
505
506  // Build contribution to outgoing argument list for calling a
507  // ForEach-able function, based on the special parameters of that
508  // function.
509  //
510  // Signature - metadata bits for the signature of the ForEach-able function
511  // X, Arg_p - values derived directly from expanded function,
512  //            suitable for computing arguments for the ForEach-able function
513  // CalleeArgs - contribution is accumulated here
514  // Bump - invoked once for each contributed outgoing argument
515  // LoopHeaderInsertionPoint - an Instruction in the loop header, before which
516  //                            this function can insert loop-invariant loads
517  //
518  // Return value is the (zero-based) position of the context (Arg_p)
519  // argument in the CalleeArgs vector, or a negative value if the
520  // context argument is not placed in the CalleeArgs vector.
521  int ExpandSpecialArguments(uint32_t Signature,
522                             llvm::Value *X,
523                             llvm::Value *Arg_p,
524                             llvm::IRBuilder<> &Builder,
525                             llvm::SmallVector<llvm::Value*, 8> &CalleeArgs,
526                             std::function<void ()> Bump,
527                             llvm::Instruction *LoopHeaderInsertionPoint) {
528
529    bccAssert(CalleeArgs.empty());
530
531    int Return = -1;
532    if (bcinfo::MetadataExtractor::hasForEachSignatureCtxt(Signature)) {
533      CalleeArgs.push_back(Arg_p);
534      Bump();
535      Return = CalleeArgs.size() - 1;
536    }
537
538    if (bcinfo::MetadataExtractor::hasForEachSignatureX(Signature)) {
539      CalleeArgs.push_back(X);
540      Bump();
541    }
542
543    if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature) ||
544        bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
545      bccAssert(LoopHeaderInsertionPoint);
546
547      // Y and Z are loop invariant, so they can be hoisted out of the
548      // loop. Set the IRBuilder insertion point to the loop header.
549      auto OldInsertionPoint = Builder.saveIP();
550      Builder.SetInsertPoint(LoopHeaderInsertionPoint);
551
552      if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature)) {
553        SmallGEPIndices YValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent,
554          RsLaunchDimensionsFieldY}));
555        llvm::Value *YAddr = Builder.CreateInBoundsGEP(Arg_p, YValueGEP, "Y.gep");
556        CalleeArgs.push_back(Builder.CreateLoad(YAddr, "Y"));
557        Bump();
558      }
559
560      if (bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
561        SmallGEPIndices ZValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent,
562          RsLaunchDimensionsFieldZ}));
563        llvm::Value *ZAddr = Builder.CreateInBoundsGEP(Arg_p, ZValueGEP, "Z.gep");
564        CalleeArgs.push_back(Builder.CreateLoad(ZAddr, "Z"));
565        Bump();
566      }
567
568      Builder.restoreIP(OldInsertionPoint);
569    }
570
571    return Return;
572  }
573
574  /* Performs the actual optimization on a selected function. On success, the
575   * Module will contain a new function of the name "<NAME>.expand" that
576   * invokes <NAME>() in a loop with the appropriate parameters.
577   */
  // Function  - the ForEach-able function to wrap.
  // Signature - its signature bitmask; 0 means "unknown", in which case the
  //             mask is derived with getRootSignature().
  //
  // Returns false when no signature could be determined (nothing is added to
  // the module in that case), true otherwise.
  bool ExpandOldStyleForEach(llvm::Function *Function, uint32_t Signature) {
    ALOGV("Expanding ForEach-able Function %s",
          Function->getName().str().c_str());

    if (!Signature) {
      Signature = getRootSignature(Function);
      if (!Signature) {
        // We couldn't determine how to expand this function based on its
        // function signature.
        return false;
      }
    }

    llvm::DataLayout DL(Module);

    llvm::Function *ExpandedFunction =
      createEmptyExpandedForEachKernel(Function->getName());

    /*
     * Extract the expanded function's parameters.  It is guaranteed by
     * createEmptyExpandedForEachKernel that there will be four parameters.
     */

    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);

    llvm::Function::arg_iterator ExpandedFunctionArgIter =
      ExpandedFunction->arg_begin();

    llvm::Value *Arg_p       = &*(ExpandedFunctionArgIter++);
    llvm::Value *Arg_x1      = &*(ExpandedFunctionArgIter++);
    llvm::Value *Arg_x2      = &*(ExpandedFunctionArgIter++);
    llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);

    llvm::Value *InStep  = nullptr;
    llvm::Value *OutStep = nullptr;

    // Construct the actual function body.
    // The builder starts in front of the skeleton's "ret void".
    llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());

    // Collect and construct the arguments for the kernel().
    // Note that we load any loop-invariant arguments before entering the Loop.
    //
    // FunctionArgIter walks the callee's parameters in the order
    // in, out, usrData, special arguments; it is advanced once per parameter
    // accounted for and must reach arg_end() (asserted below).
    llvm::Function::arg_iterator FunctionArgIter = Function->arg_begin();

    llvm::Type  *InTy      = nullptr;
    llvm::Value *InBufPtr = nullptr;
    if (bcinfo::MetadataExtractor::hasForEachSignatureIn(Signature)) {
      // Input stride: p->inStride[0], possibly replaced by a constant.
      SmallGEPIndices InStepGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInStride, 0}));
      llvm::LoadInst *InStepArg  = Builder.CreateLoad(
        Builder.CreateInBoundsGEP(Arg_p, InStepGEP, "instep_addr.gep"), "instep_addr");

      InTy = (FunctionArgIter++)->getType();
      InStep = getStepValue(&DL, InTy, InStepArg);

      InStep->setName("instep");

      // Input base pointer: p->inPtr[0].
      SmallGEPIndices InputAddrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr, 0}));
      InBufPtr = Builder.CreateLoad(
        Builder.CreateInBoundsGEP(Arg_p, InputAddrGEP, "input_buf.gep"), "input_buf");
    }

    llvm::Type *OutTy = nullptr;
    llvm::Value *OutBasePtr = nullptr;
    if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
      OutTy = (FunctionArgIter++)->getType();
      // Unlike the input stride, the output stride comes from the expanded
      // function's own "arg_outstep" parameter.
      OutStep = getStepValue(&DL, OutTy, Arg_outstep);
      OutStep->setName("outstep");
      // Output base pointer: p->outPtr[0].
      SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
      OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
    }

    llvm::Value *UsrData = nullptr;
    if (bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)) {
      // p->usr, cast to the pointer type the callee declares.
      llvm::Type *UsrDataTy = (FunctionArgIter++)->getType();
      llvm::Value *UsrDataPointerAddr = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldUsr);
      UsrData = Builder.CreatePointerCast(Builder.CreateLoad(UsrDataPointerAddr), UsrDataTy);
      UsrData->setName("UsrData");
    }

    // Remember the block that precedes the loop; its terminator is where
    // ExpandSpecialArguments() places loop-invariant loads. After createLoop()
    // the builder inserts into the loop body.
    llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
    llvm::PHINode *IV;
    createLoop(Builder, Arg_x1, Arg_x2, &IV);

    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
    const int CalleeArgsContextIdx = ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
                                                            [&FunctionArgIter]() { FunctionArgIter++; },
                                                            LoopHeader->getTerminator());

    // Every callee parameter must have been accounted for by now.
    bccAssert(FunctionArgIter == Function->arg_end());

    // Populate the actual call to kernel().
    llvm::SmallVector<llvm::Value*, 8> RootArgs;

    llvm::Value *InPtr  = nullptr;
    llvm::Value *OutPtr = nullptr;

    // Calculate the current input and output pointers
    //
    // We always calculate the input/output pointers with a GEP operating on i8
    // values and only cast at the very end to OutTy. This is because the step
    // between two values is given in bytes.
    //
    // TODO: We could further optimize the output by using a GEP operation of
    // type 'OutTy' in cases where the element type of the allocation allows.
    if (OutBasePtr) {
      // Byte offset of the current cell: (IV - x1) * outstep.
      llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
      OutOffset = Builder.CreateMul(OutOffset, OutStep);
      OutPtr = Builder.CreateInBoundsGEP(OutBasePtr, OutOffset);
      OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
    }

    if (InBufPtr) {
      // Byte offset of the current cell: (IV - x1) * instep.
      llvm::Value *InOffset = Builder.CreateSub(IV, Arg_x1);
      InOffset = Builder.CreateMul(InOffset, InStep);
      InPtr = Builder.CreateInBoundsGEP(InBufPtr, InOffset);
      InPtr = Builder.CreatePointerCast(InPtr, InTy);
    }

    // Argument order for the call: in, out, usrData, then the special
    // arguments appended by finishArgList().
    if (InPtr) {
      RootArgs.push_back(InPtr);
    }

    if (OutPtr) {
      RootArgs.push_back(OutPtr);
    }

    if (UsrData) {
      RootArgs.push_back(UsrData);
    }

    finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);

    Builder.CreateCall(Function, RootArgs);

    return true;
  }
713
714  /* Expand a pass-by-value foreach kernel.
715   */
716  bool ExpandForEach(llvm::Function *Function, uint32_t Signature) {
717    bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature));
718    ALOGV("Expanding kernel Function %s", Function->getName().str().c_str());
719
720    // TODO: Refactor this to share functionality with ExpandOldStyleForEach.
721    llvm::DataLayout DL(Module);
722
723    llvm::Function *ExpandedFunction =
724      createEmptyExpandedForEachKernel(Function->getName());
725
    /*
     * Extract the expanded function's parameters.  It is guaranteed by
     * createEmptyExpandedForEachKernel that there will be four parameters.
     */
730
731    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
732
733    llvm::Function::arg_iterator ExpandedFunctionArgIter =
734      ExpandedFunction->arg_begin();
735
736    llvm::Value *Arg_p       = &*(ExpandedFunctionArgIter++);
737    llvm::Value *Arg_x1      = &*(ExpandedFunctionArgIter++);
738    llvm::Value *Arg_x2      = &*(ExpandedFunctionArgIter++);
739    llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);
740
741    // Construct the actual function body.
742    llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
743
744    // Create TBAA meta-data.
745    llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript,
746                 *TBAAAllocation, *TBAAPointer;
747    llvm::MDBuilder MDHelper(*Context);
748
749    TBAARenderScriptDistinct =
750      MDHelper.createTBAARoot(kRenderScriptTBAARootName);
751    TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
752        TBAARenderScriptDistinct);
753    TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
754                                                       TBAARenderScript);
755    TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
756                                                      TBAAAllocation, 0);
757    TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer",
758                                                    TBAARenderScript);
759    TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);
760
761    /*
762     * Collect and construct the arguments for the kernel().
763     *
764     * Note that we load any loop-invariant arguments before entering the Loop.
765     */
766    size_t NumRemainingInputs = Function->arg_size();
767
768    // No usrData parameter on kernels.
769    bccAssert(
770        !bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature));
771
772    llvm::Function::arg_iterator ArgIter = Function->arg_begin();
773
774    // Check the return type
775    llvm::Type     *OutTy            = nullptr;
776    llvm::Value    *OutStep          = nullptr;
777    llvm::LoadInst *OutBasePtr       = nullptr;
778    llvm::Value    *CastedOutBasePtr = nullptr;
779
780    bool PassOutByPointer = false;
781
782    if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
783      llvm::Type *OutBaseTy = Function->getReturnType();
784
785      if (OutBaseTy->isVoidTy()) {
786        PassOutByPointer = true;
787        OutTy = ArgIter->getType();
788
789        ArgIter++;
790        --NumRemainingInputs;
791      } else {
792        // We don't increment Args, since we are using the actual return type.
793        OutTy = OutBaseTy->getPointerTo();
794      }
795
796      OutStep = getStepValue(&DL, OutTy, Arg_outstep);
797      OutStep->setName("outstep");
798      SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
799      OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
800
801      if (gEnableRsTbaa) {
802        OutBasePtr->setMetadata("tbaa", TBAAPointer);
803      }
804
805      CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out");
806    }
807
808    llvm::SmallVector<llvm::Type*,  8> InTypes;
809    llvm::SmallVector<llvm::Value*, 8> InSteps;
810    llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
811    llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
812
813    bccAssert(NumRemainingInputs <= RS_KERNEL_INPUT_LIMIT);
814
815    // Create the loop structure.
816    llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
817    llvm::PHINode *IV;
818    createLoop(Builder, Arg_x1, Arg_x2, &IV);
819
820    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
821    const int CalleeArgsContextIdx =
822      ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
823                             [&NumRemainingInputs]() { --NumRemainingInputs; },
824                             LoopHeader->getTerminator());
825
826    // After ExpandSpecialArguments() gets called, NumRemainingInputs
827    // counts the number of arguments to the kernel that correspond to
828    // an array entry from the InPtr field of the DriverInfo
829    // structure.
830    const size_t NumInPtrArguments = NumRemainingInputs;
831
832    if (NumInPtrArguments > 0) {
833      // Extract information about input slots and step sizes. The work done
834      // here is loop-invariant, so we can hoist the operations out of the loop.
835      auto OldInsertionPoint = Builder.saveIP();
836      Builder.SetInsertPoint(LoopHeader->getTerminator());
837
838      for (size_t InputIndex = 0; InputIndex < NumInPtrArguments; ++InputIndex, ArgIter++) {
839        SmallGEPIndices InStepGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInStride,
840          static_cast<int32_t>(InputIndex)}));
841        llvm::Value *InStepAddr = Builder.CreateInBoundsGEP(Arg_p, InStepGEP, "instep_addr.gep");
842        llvm::LoadInst *InStepArg = Builder.CreateLoad(InStepAddr, "instep_addr");
843
844        llvm::Type *InType = ArgIter->getType();
845
846        /*
847         * AArch64 calling conventions dictate that structs of sufficient size
848         * get passed by pointer instead of passed by value.  This, combined
849         * with the fact that we don't allow kernels to operate on pointer
850         * data means that if we see a kernel with a pointer parameter we know
851         * that it is a struct input that has been promoted.  As such we don't
852         * need to convert its type to a pointer.  Later we will need to know
853         * to create a temporary copy on the stack, so we save this information
854         * in InStructTempSlots.
855         */
856        if (auto PtrType = llvm::dyn_cast<llvm::PointerType>(InType)) {
857          llvm::Type *ElementType = PtrType->getElementType();
858          InStructTempSlots.push_back(Builder.CreateAlloca(ElementType, nullptr,
859                                                           "input_struct_slot"));
860        } else {
861          InType = InType->getPointerTo();
862          InStructTempSlots.push_back(nullptr);
863        }
864
865        llvm::Value *InStep = getStepValue(&DL, InType, InStepArg);
866
867        InStep->setName("instep");
868
869        SmallGEPIndices InBufPtrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr,
870          static_cast<int32_t>(InputIndex)}));
871        llvm::Value    *InBufPtrAddr = Builder.CreateInBoundsGEP(Arg_p, InBufPtrGEP, "input_buf.gep");
872        llvm::LoadInst *InBufPtr = Builder.CreateLoad(InBufPtrAddr, "input_buf");
873        llvm::Value    *CastInBufPtr = Builder.CreatePointerCast(InBufPtr, InType, "casted_in");
874        if (gEnableRsTbaa) {
875          InBufPtr->setMetadata("tbaa", TBAAPointer);
876        }
877
878        InTypes.push_back(InType);
879        InSteps.push_back(InStep);
880        InBufPtrs.push_back(CastInBufPtr);
881      }
882
883      Builder.restoreIP(OldInsertionPoint);
884    }
885
886    // Populate the actual call to kernel().
887    llvm::SmallVector<llvm::Value*, 8> RootArgs;
888
889    // Calculate the current input and output pointers.
890
891    // Output
892
893    llvm::Value *OutPtr = nullptr;
894    if (CastedOutBasePtr) {
895      llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
896      OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffset);
897
898      if (PassOutByPointer) {
899        RootArgs.push_back(OutPtr);
900      }
901    }
902
903    // Inputs
904
905    if (NumInPtrArguments > 0) {
906      llvm::Value *Offset = Builder.CreateSub(IV, Arg_x1);
907
908      for (size_t Index = 0; Index < NumInPtrArguments; ++Index) {
909        llvm::Value *InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], Offset);
910        llvm::Value *Input;
911
912        llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input");
913
914        if (gEnableRsTbaa) {
915          InputLoad->setMetadata("tbaa", TBAAAllocation);
916        }
917
918        if (llvm::Value *TemporarySlot = InStructTempSlots[Index]) {
919          // Pass a pointer to a temporary on the stack, rather than
920          // passing a pointer to the original value. We do not want
921          // the kernel to potentially modify the input data.
922
923          // Note: don't annotate with TBAA, since the kernel might
924          // have its own TBAA annotations for the pointer argument.
925          Builder.CreateStore(InputLoad, TemporarySlot);
926          Input = TemporarySlot;
927        } else {
928          Input = InputLoad;
929        }
930
931        RootArgs.push_back(Input);
932      }
933    }
934
935    finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);
936
937    llvm::Value *RetVal = Builder.CreateCall(Function, RootArgs);
938
939    if (OutPtr && !PassOutByPointer) {
940      RetVal->setName("call.result");
941      llvm::StoreInst *Store = Builder.CreateStore(RetVal, OutPtr);
942      if (gEnableRsTbaa) {
943        Store->setMetadata("tbaa", TBAAAllocation);
944      }
945    }
946
947    return true;
948  }
949
950  // Expand a reduce-style kernel function.
951  //
952  // The input is a kernel which represents a binary operation,
953  // of the form
954  //
955  //   define foo @func(foo %a, foo %b),
956  //
957  // (More generally, it can be of the forms
958  //
959  //   define void @func(foo* %ret, foo* %a, foo* %b)
960  //   define void @func(foo* %ret, foo1 %a, foo1 %b)
961  //   define foo1 @func(foo2 %a, foo2 %b)
962  //
963  // as a result of argument / return value conversions. Here, "foo1"
964  // and "foo2" refer to possibly coerced types, and the coerced
965  // argument type may be different from the coerced return type. See
966  // "Note on coercion" below.)
967  //
968  // Note also, we do not expect to encounter any case when the
969  // arguments are promoted to pointers but the return value is
970  // unpromoted to pointer, e.g.
971  //
972  //   define foo1 @func(foo* %a, foo* %b)
973  //
974  // and we will throw an assertion in this case.)
975  //
976  // The input kernel gets expanded into a kernel of the form
977  //
978  //   define void @func.expand(i8* %inBuf, i8* outBuf, i32 len)
979  //
980  // which performs a serial reduction of `len` elements from `inBuf`,
981  // and stores the result into `outBuf`. In pseudocode, @func.expand
982  // does:
983  //
984  //   inArr := (foo *)inBuf;
985  //   accum := inArr[0];
986  //   for (i := 1; i < len; ++i) {
987  //     accum := foo(accum, inArr[i]);
988  //   }
989  //   *(foo *)outBuf := accum;
990  //
991  // Note on coercion
992  //
993  // Both the return value and the argument types may undergo internal
994  // coercion in clang as part of call lowering. As a result, the
995  // return value type may differ from the argument type even if the
996  // types in the RenderScript signaure are the same. For instance, the
997  // kernel
998  //
999  //   int3 add(int3 a, int3 b) { return a + b; }
1000  //
1001  // gets lowered by clang as
1002  //
1003  //   define <3 x i32> @add(<4 x i32> %a.coerce, <4 x i32> %b.coerce)
1004  //
1005  // under AArch64. The details of this process are found in clang,
1006  // lib/CodeGen/TargetInfo.cpp, under classifyArgumentType() and
1007  // classifyReturnType() in ARMABIInfo, AArch64ABIInfo. If the value
1008  // is passed by pointer, then the pointed-to type is not coerced.
1009  //
1010  // Since we lack the original type information, this code does loads
1011  // and stores of allocation data by way of pointers to the coerced
1012  // type.
1013  bool ExpandReduce(llvm::Function *Function) {
1014    bccAssert(Function);
1015
1016    ALOGV("Expanding reduce kernel %s", Function->getName().str().c_str());
1017
1018    llvm::DataLayout DL(Module);
1019
1020    // TBAA Metadata
1021    llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript, *TBAAAllocation;
1022    llvm::MDBuilder MDHelper(*Context);
1023
1024    TBAARenderScriptDistinct =
1025      MDHelper.createTBAARoot(kRenderScriptTBAARootName);
1026    TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
1027        TBAARenderScriptDistinct);
1028    TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
1029                                                       TBAARenderScript);
1030    TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
1031                                                      TBAAAllocation, 0);
1032
1033    llvm::Function *ExpandedFunction =
1034      createEmptyExpandedReduceKernel(Function->getName());
1035
1036    // Extract the expanded kernel's parameters.  It is guaranteed by
1037    // createEmptyExpandedFunction that there will be 3 parameters.
1038    auto ExpandedFunctionArgIter = ExpandedFunction->arg_begin();
1039
1040    llvm::Value *Arg_inBuf  = &*(ExpandedFunctionArgIter++);
1041    llvm::Value *Arg_outBuf = &*(ExpandedFunctionArgIter++);
1042    llvm::Value *Arg_len    = &*(ExpandedFunctionArgIter++);
1043
1044    bccAssert(Function->arg_size() == 2 || Function->arg_size() == 3);
1045
1046    // Check if, instead of returning a value, the original kernel has
1047    // a pointer parameter which points to a temporary buffer into
1048    // which the return value gets written.
1049    const bool ReturnValuePointerStyle = (Function->arg_size() == 3);
1050    bccAssert(Function->getReturnType()->isVoidTy() == ReturnValuePointerStyle);
1051
1052    // Check if, instead of being passed by value, the inputs to the
1053    // original kernel are passed by pointer.
1054    auto FirstArgIter = Function->arg_begin();
1055    // The second argument is always an input to the original kernel.
1056    auto SecondArgIter = std::next(FirstArgIter);
1057    const bool InputsPointerStyle = SecondArgIter->getType()->isPointerTy();
1058
1059    // Get the output type (i.e. return type of the original kernel).
1060    llvm::PointerType *OutPtrTy = nullptr;
1061    llvm::Type *OutTy = nullptr;
1062    if (ReturnValuePointerStyle) {
1063      OutPtrTy = llvm::dyn_cast<llvm::PointerType>(FirstArgIter->getType());
1064      bccAssert(OutPtrTy && "Expected a pointer parameter to kernel");
1065      OutTy = OutPtrTy->getElementType();
1066    } else {
1067      OutTy = Function->getReturnType();
1068      bccAssert(!OutTy->isVoidTy());
1069      OutPtrTy = OutTy->getPointerTo();
1070    }
1071
1072    // Get the input type (type of the arguments to the original
1073    // kernel). Some input types are different from the output type,
1074    // due to explicit coercion that the compiler performs when
1075    // lowering the parameters. See "Note on coercion" above.
1076    llvm::PointerType *InPtrTy;
1077    llvm::Type *InTy;
1078    if (InputsPointerStyle) {
1079      InPtrTy = llvm::dyn_cast<llvm::PointerType>(SecondArgIter->getType());
1080      bccAssert(InPtrTy && "Expected a pointer parameter to kernel");
1081      bccAssert(ReturnValuePointerStyle);
1082      bccAssert(std::next(SecondArgIter)->getType() == InPtrTy &&
1083                "Input type mismatch");
1084      InTy = InPtrTy->getElementType();
1085    } else {
1086      InTy = SecondArgIter->getType();
1087      InPtrTy = InTy->getPointerTo();
1088      if (!ReturnValuePointerStyle) {
1089        bccAssert(InTy == FirstArgIter->getType() && "Input type mismatch");
1090      } else {
1091        bccAssert(InTy == std::next(SecondArgIter)->getType() &&
1092                  "Input type mismatch");
1093      }
1094    }
1095
1096    // The input type should take up the same amount of space in
1097    // memory as the output type.
1098    bccAssert(DL.getTypeAllocSize(InTy) == DL.getTypeAllocSize(OutTy));
1099
1100    // Construct the actual function body.
1101    llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
1102
1103    // Cast input and output buffers to appropriate types.
1104    llvm::Value *InBuf = Builder.CreatePointerCast(Arg_inBuf, InPtrTy);
1105    llvm::Value *OutBuf = Builder.CreatePointerCast(Arg_outBuf, OutPtrTy);
1106
1107    // Create a slot to pass temporary results back. This needs to be
1108    // separate from the accumulator slot because the kernel may mark
1109    // the return value slot as noalias.
1110    llvm::Value *ReturnBuf = nullptr;
1111    if (ReturnValuePointerStyle) {
1112      ReturnBuf = Builder.CreateAlloca(OutTy, nullptr, "ret.tmp");
1113    }
1114
1115    // Create a slot to hold the second input if the inputs are passed
1116    // by pointer to the original kernel. We cannot directly pass a
1117    // pointer to the input buffer, because the kernel may modify its
1118    // inputs.
1119    llvm::Value *SecondInputTempBuf = nullptr;
1120    if (InputsPointerStyle) {
1121      SecondInputTempBuf = Builder.CreateAlloca(InTy, nullptr, "in.tmp");
1122    }
1123
1124    // Create a slot to accumulate temporary results, and fill it with
1125    // the first value.
1126    llvm::Value *AccumBuf = Builder.CreateAlloca(OutTy, nullptr, "accum");
1127    // Cast to OutPtrTy before loading, since AccumBuf has type OutPtrTy.
1128    llvm::LoadInst *FirstElementLoad = Builder.CreateLoad(
1129      Builder.CreatePointerCast(InBuf, OutPtrTy));
1130    if (gEnableRsTbaa) {
1131      FirstElementLoad->setMetadata("tbaa", TBAAAllocation);
1132    }
1133    // Memory operations with AccumBuf shouldn't be marked with
1134    // RenderScript TBAA, since this might conflict with TBAA metadata
1135    // in the kernel function when AccumBuf is passed by pointer.
1136    Builder.CreateStore(FirstElementLoad, AccumBuf);
1137
1138    // Loop body
1139
1140    // Create the loop structure. Note that the first input in the input buffer
1141    // has already been accumulated, so that we start at index 1.
1142    llvm::PHINode *IndVar;
1143    llvm::Value *Start = llvm::ConstantInt::get(Arg_len->getType(), 1);
1144    llvm::BasicBlock *Exit = createLoop(Builder, Start, Arg_len, &IndVar);
1145
1146    llvm::Value *InputPtr = Builder.CreateInBoundsGEP(InBuf, IndVar, "next_input.gep");
1147
1148    // Set up arguments and call the original (unexpanded) kernel.
1149    //
1150    // The original kernel can have at most 3 arguments, which is
1151    // achieved when the signature looks like:
1152    //
1153    //    define void @func(foo* %ret, bar %a, bar %b)
1154    //
1155    // (bar can be one of foo/foo.coerce/foo*).
1156    llvm::SmallVector<llvm::Value *, 3> KernelArgs;
1157
1158    if (ReturnValuePointerStyle) {
1159      KernelArgs.push_back(ReturnBuf);
1160    }
1161
1162    if (InputsPointerStyle) {
1163      bccAssert(ReturnValuePointerStyle);
1164      // Because the return buffer is copied back into the
1165      // accumulator, it's okay if the accumulator is overwritten.
1166      KernelArgs.push_back(AccumBuf);
1167
1168      llvm::LoadInst *InputLoad = Builder.CreateLoad(InputPtr);
1169      if (gEnableRsTbaa) {
1170        InputLoad->setMetadata("tbaa", TBAAAllocation);
1171      }
1172      Builder.CreateStore(InputLoad, SecondInputTempBuf);
1173
1174      KernelArgs.push_back(SecondInputTempBuf);
1175    } else {
1176      // InPtrTy may be different from OutPtrTy (the type of
1177      // AccumBuf), so first cast the accumulator buffer to the
1178      // pointer type corresponding to the input argument type.
1179      KernelArgs.push_back(
1180        Builder.CreateLoad(Builder.CreatePointerCast(AccumBuf, InPtrTy)));
1181
1182      llvm::LoadInst *LoadedArg = Builder.CreateLoad(InputPtr);
1183      if (gEnableRsTbaa) {
1184        LoadedArg->setMetadata("tbaa", TBAAAllocation);
1185      }
1186      KernelArgs.push_back(LoadedArg);
1187    }
1188
1189    llvm::Value *RetVal = Builder.CreateCall(Function, KernelArgs);
1190
1191    const uint64_t ElementSize = DL.getTypeStoreSize(OutTy);
1192    const uint64_t ElementAlign = DL.getABITypeAlignment(OutTy);
1193
1194    // Store the output in the accumulator.
1195    if (ReturnValuePointerStyle) {
1196      Builder.CreateMemCpy(AccumBuf, ReturnBuf, ElementSize, ElementAlign);
1197    } else {
1198      Builder.CreateStore(RetVal, AccumBuf);
1199    }
1200
1201    // Loop exit
1202    Builder.SetInsertPoint(Exit, Exit->begin());
1203
1204    llvm::LoadInst *OutputLoad = Builder.CreateLoad(AccumBuf);
1205    llvm::StoreInst *OutputStore = Builder.CreateStore(OutputLoad, OutBuf);
1206    if (gEnableRsTbaa) {
1207      OutputStore->setMetadata("tbaa", TBAAAllocation);
1208    }
1209
1210    return true;
1211  }
1212
1213  /// @brief Checks if pointers to allocation internals are exposed
1214  ///
1215  /// This function verifies if through the parameters passed to the kernel
1216  /// or through calls to the runtime library the script gains access to
1217  /// pointers pointing to data within a RenderScript Allocation.
1218  /// If we know we control all loads from and stores to data within
1219  /// RenderScript allocations and if we know the run-time internal accesses
1220  /// are all annotated with RenderScript TBAA metadata, only then we
1221  /// can safely use TBAA to distinguish between generic and from-allocation
1222  /// pointers.
1223  bool allocPointersExposed(llvm::Module &Module) {
1224    // Old style kernel function can expose pointers to elements within
1225    // allocations.
1226    // TODO: Extend analysis to allow simple cases of old-style kernels.
1227    for (size_t i = 0; i < mExportForEachCount; ++i) {
1228      const char *Name = mExportForEachNameList[i];
1229      uint32_t Signature = mExportForEachSignatureList[i];
1230      if (Module.getFunction(Name) &&
1231          !bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature)) {
1232        return true;
1233      }
1234    }
1235
1236    // Check for library functions that expose a pointer to an Allocation or
1237    // that are not yet annotated with RenderScript-specific tbaa information.
1238    static const std::vector<const char *> Funcs{
1239      // rsGetElementAt(...)
1240      "_Z14rsGetElementAt13rs_allocationj",
1241      "_Z14rsGetElementAt13rs_allocationjj",
1242      "_Z14rsGetElementAt13rs_allocationjjj",
1243
1244      // rsSetElementAt()
1245      "_Z14rsSetElementAt13rs_allocationPvj",
1246      "_Z14rsSetElementAt13rs_allocationPvjj",
1247      "_Z14rsSetElementAt13rs_allocationPvjjj",
1248
1249      // rsGetElementAtYuv_uchar_Y()
1250      "_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj",
1251
1252      // rsGetElementAtYuv_uchar_U()
1253      "_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj",
1254
1255      // rsGetElementAtYuv_uchar_V()
1256      "_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj",
1257    };
1258
1259    for (auto FI : Funcs) {
1260      llvm::Function *Function = Module.getFunction(FI);
1261
1262      if (!Function) {
1263        ALOGE("Missing run-time function '%s'", FI);
1264        return true;
1265      }
1266
1267      if (Function->getNumUses() > 0) {
1268        return true;
1269      }
1270    }
1271
1272    return false;
1273  }
1274
1275  /// @brief Connect RenderScript TBAA metadata to C/C++ metadata
1276  ///
1277  /// The TBAA metadata used to annotate loads/stores from RenderScript
1278  /// Allocations is generated in a separate TBAA tree with a
1279  /// "RenderScript Distinct TBAA" root node. LLVM does assume may-alias for
1280  /// all nodes in unrelated alias analysis trees. This function makes the
1281  /// "RenderScript TBAA" node (which is parented by the Distinct TBAA root),
1282  /// a subtree of the normal C/C++ TBAA tree aside of normal C/C++ types. With
1283  /// the connected trees every access to an Allocation is resolved to
1284  /// must-alias if compared to a normal C/C++ access.
1285  void connectRenderScriptTBAAMetadata(llvm::Module &Module) {
1286    llvm::MDBuilder MDHelper(*Context);
1287    llvm::MDNode *TBAARenderScriptDistinct =
1288      MDHelper.createTBAARoot("RenderScript Distinct TBAA");
1289    llvm::MDNode *TBAARenderScript = MDHelper.createTBAANode(
1290        "RenderScript TBAA", TBAARenderScriptDistinct);
1291    llvm::MDNode *TBAARoot     = MDHelper.createTBAARoot("Simple C/C++ TBAA");
1292    TBAARenderScript->replaceOperandWith(1, TBAARoot);
1293  }
1294
1295  virtual bool runOnModule(llvm::Module &Module) {
1296    bool Changed  = false;
1297    this->Module  = &Module;
1298    Context = &Module.getContext();
1299
1300    buildTypes();
1301
1302    bcinfo::MetadataExtractor me(&Module);
1303    if (!me.extract()) {
1304      ALOGE("Could not extract metadata from module!");
1305      return false;
1306    }
1307
1308    // Expand forEach_* style kernels.
1309    mExportForEachCount = me.getExportForEachSignatureCount();
1310    mExportForEachNameList = me.getExportForEachNameList();
1311    mExportForEachSignatureList = me.getExportForEachSignatureList();
1312
1313    for (size_t i = 0; i < mExportForEachCount; ++i) {
1314      const char *name = mExportForEachNameList[i];
1315      uint32_t signature = mExportForEachSignatureList[i];
1316      llvm::Function *kernel = Module.getFunction(name);
1317      if (kernel) {
1318        if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) {
1319          Changed |= ExpandForEach(kernel, signature);
1320          kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
1321        } else if (kernel->getReturnType()->isVoidTy()) {
1322          Changed |= ExpandOldStyleForEach(kernel, signature);
1323          kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
1324        } else {
1325          // There are some graphics root functions that are not
1326          // expanded, but that will be called directly. For those
1327          // functions, we can not set the linkage to internal.
1328        }
1329      }
1330    }
1331
1332    // Expand reduce_* style kernels.
1333    mExportReduceCount = me.getExportReduceCount();
1334    mExportReduceNameList = me.getExportReduceNameList();
1335
1336    for (size_t i = 0; i < mExportReduceCount; ++i) {
1337      llvm::Function *kernel = Module.getFunction(mExportReduceNameList[i]);
1338      if (kernel) {
1339        Changed |= ExpandReduce(kernel);
1340      }
1341    }
1342
1343    if (gEnableRsTbaa && !allocPointersExposed(Module)) {
1344      connectRenderScriptTBAAMetadata(Module);
1345    }
1346
1347    return Changed;
1348  }
1349
1350  virtual const char *getPassName() const {
1351    return "forEach_* and reduce_* function expansion";
1352  }
1353
1354}; // end RSKernelExpandPass
1355
1356} // end anonymous namespace
1357
1358char RSKernelExpandPass::ID = 0;
1359static llvm::RegisterPass<RSKernelExpandPass> X("kernelexp", "Kernel Expand Pass");
1360
1361namespace bcc {
1362
1363llvm::ModulePass *
1364createRSKernelExpandPass(bool pEnableStepOpt) {
1365  return new RSKernelExpandPass(pEnableStepOpt);
1366}
1367
1368} // end namespace bcc
1369