RSForEachExpand.cpp revision 354d1c132ad7e1ff6fdb0da95443245848a0601f
1/*
2 * Copyright 2012, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "bcc/Assert.h"
18#include "bcc/Renderscript/RSTransforms.h"
19
20#include <cstdlib>
21#include <functional>
22
23#include <llvm/IR/DerivedTypes.h>
24#include <llvm/IR/Function.h>
25#include <llvm/IR/Instructions.h>
26#include <llvm/IR/IRBuilder.h>
27#include <llvm/IR/MDBuilder.h>
28#include <llvm/IR/Module.h>
29#include <llvm/Pass.h>
30#include <llvm/Support/raw_ostream.h>
31#include <llvm/IR/DataLayout.h>
32#include <llvm/IR/Function.h>
33#include <llvm/IR/Type.h>
34#include <llvm/Transforms/Utils/BasicBlockUtils.h>
35
36#include "bcc/Config/Config.h"
37#include "bcc/Support/Log.h"
38
39#include "bcinfo/MetadataExtractor.h"
40
// Number of parameters of every generated "<NAME>.expand" function
// (p, x1, x2, outstep). The pass checks the created function against this
// value with bccAssert.
41#define NUM_EXPANDED_FUNCTION_PARAMS 4
42
43using namespace bcc;
44
45namespace {
46
// When true, ExpandKernel attaches "tbaa" metadata to the base-pointer loads,
// the input loads and the output store it emits.
47static const bool gEnableRsTbaa = true;
48
49/* RSForEachExpandPass - This pass operates on functions that are able to be
50 * called via rsForEach() or "foreach_<NAME>". We create an inner loop for the
51 * ForEach-able function to be invoked over the appropriate data cells of the
52 * input/output allocations (adjusting other relevant parameters as we go). We
53 * support doing this for any ForEach-able compute kernels. The new function
54 * name is the original function name followed by ".expand". Note that we
55 * still generate code for the original function.
56 */
57class RSForEachExpandPass : public llvm::ModulePass {
58public:
  // Pass identifier; handed to the llvm::ModulePass base-class constructor.
59  static char ID;
60
61private:
  // Length of the inPtr/inStride/outPtr/outStride arrays in the driver info
  // struct built by buildTypes(); also the upper bound ExpandKernel asserts on
  // the number of kernel inputs.
62  static const size_t RS_KERNEL_INPUT_LIMIT = 8; // see frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h
63
  // Field indices into the LLVM "RsLaunchDimensions" struct type created by
  // buildTypes(). The enumerator order must match the order in which
  // buildTypes() pushes the field types.
64  enum RsLaunchDimensionsField {
65    RsLaunchDimensionsFieldX,
66    RsLaunchDimensionsFieldY,
67    RsLaunchDimensionsFieldZ,
68    RsLaunchDimensionsFieldLod,
69    RsLaunchDimensionsFieldFace,
70    RsLaunchDimensionsFieldArray,
71
      // Number of fields; used to size the type list in buildTypes().
72    RsLaunchDimensionsFieldCount
73  };
74
  // Field indices into the LLVM "RsExpandKernelDriverInfoPfx" struct type
  // created by buildTypes(). The enumerator order must match the order in
  // which buildTypes() pushes the field types.
75  enum RsExpandKernelDriverInfoPfxField {
76    RsExpandKernelDriverInfoPfxFieldInPtr,
77    RsExpandKernelDriverInfoPfxFieldInStride,
78    RsExpandKernelDriverInfoPfxFieldInLen,
79    RsExpandKernelDriverInfoPfxFieldOutPtr,
80    RsExpandKernelDriverInfoPfxFieldOutStride,
81    RsExpandKernelDriverInfoPfxFieldOutLen,
82    RsExpandKernelDriverInfoPfxFieldDim,
83    RsExpandKernelDriverInfoPfxFieldCurrent,
84    RsExpandKernelDriverInfoPfxFieldUsr,
      // NOTE(review): "UsLenr" looks like a transposition of "UsrLen" (the
      // corresponding struct field is usrLen). Not renamed here because uses
      // outside this part of the file cannot be checked.
85    RsExpandKernelDriverInfoPfxFieldUsLenr,
86
      // Number of fields; used to size the type list in buildTypes().
87    RsExpandKernelDriverInfoPfxFieldCount
88  };
89
  // Module being transformed and the LLVM context used to create types and
  // blocks. Both are nullptr after construction; they are assigned outside the
  // part of the file visible here.
90  llvm::Module *Module;
91  llvm::LLVMContext *Context;
92
93  /*
94   * Pointer to LLVM type information for the function signature
95   * for expanded kernels.  This must be re-calculated for each
96   * module the pass is run on.
97   */
98  llvm::FunctionType *ExpandedFunctionType;
99
  // Exported ForEach function table (count, names, signature bitmasks).
  // NOTE(review): presumably filled from bcinfo metadata before expansion;
  // the code that sets these is not visible here - confirm.
100  uint32_t mExportForEachCount;
101  const char **mExportForEachNameList;
102  const uint32_t *mExportForEachSignatureList;
103
104  // Turns on optimization of allocation stride values.
  // Set from the constructor argument pEnableStepOpt (default: true).
105  bool mEnableStepOpt;
106
107  uint32_t getRootSignature(llvm::Function *Function) {
108    const llvm::NamedMDNode *ExportForEachMetadata =
109        Module->getNamedMetadata("#rs_export_foreach");
110
111    if (!ExportForEachMetadata) {
112      llvm::SmallVector<llvm::Type*, 8> RootArgTys;
113      for (llvm::Function::arg_iterator B = Function->arg_begin(),
114                                        E = Function->arg_end();
115           B != E;
116           ++B) {
117        RootArgTys.push_back(B->getType());
118      }
119
120      // For pre-ICS bitcode, we may not have signature information. In that
121      // case, we use the size of the RootArgTys to select the number of
122      // arguments.
123      return (1 << RootArgTys.size()) - 1;
124    }
125
126    if (ExportForEachMetadata->getNumOperands() == 0) {
127      return 0;
128    }
129
130    bccAssert(ExportForEachMetadata->getNumOperands() > 0);
131
132    // We only handle the case for legacy root() functions here, so this is
133    // hard-coded to look at only the first such function.
134    llvm::MDNode *SigNode = ExportForEachMetadata->getOperand(0);
135    if (SigNode != nullptr && SigNode->getNumOperands() == 1) {
136      llvm::Metadata *SigMD = SigNode->getOperand(0);
137      if (llvm::MDString *SigS = llvm::dyn_cast<llvm::MDString>(SigMD)) {
138        llvm::StringRef SigString = SigS->getString();
139        uint32_t Signature = 0;
140        if (SigString.getAsInteger(10, Signature)) {
141          ALOGE("Non-integer signature value '%s'", SigString.str().c_str());
142          return 0;
143        }
144        return Signature;
145      }
146    }
147
148    return 0;
149  }
150
151  bool isStepOptSupported(llvm::Type *AllocType) {
152
153    llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
154    llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context);
155
156    if (mEnableStepOpt) {
157      return false;
158    }
159
160    if (AllocType == VoidPtrTy) {
161      return false;
162    }
163
164    if (!PT) {
165      return false;
166    }
167
168    // remaining conditions are 64-bit only
169    if (VoidPtrTy->getPrimitiveSizeInBits() == 32) {
170      return true;
171    }
172
173    // coerce suggests an upconverted struct type, which we can't support
174    if (AllocType->getStructName().find("coerce") != llvm::StringRef::npos) {
175      return false;
176    }
177
178    // 2xi64 and i128 suggest an upconverted struct type, which are also unsupported
179    llvm::Type *V2xi64Ty = llvm::VectorType::get(llvm::Type::getInt64Ty(*Context), 2);
180    llvm::Type *Int128Ty = llvm::Type::getIntNTy(*Context, 128);
181    if (AllocType == V2xi64Ty || AllocType == Int128Ty) {
182      return false;
183    }
184
185    return true;
186  }
187
188  // Get the actual value we should use to step through an allocation.
189  //
190  // Normally the value we use to step through an allocation is given to us by
191  // the driver. However, for certain primitive data types, we can derive an
192  // integer constant for the step value. We use this integer constant whenever
193  // possible to allow further compiler optimizations to take place.
194  //
195  // DL - Target Data size/layout information.
196  // T - Type of allocation (should be a pointer).
197  // OrigStep - Original step increment (root.expand() input from driver).
198  llvm::Value *getStepValue(llvm::DataLayout *DL, llvm::Type *AllocType,
199                            llvm::Value *OrigStep) {
200    bccAssert(DL);
201    bccAssert(AllocType);
202    bccAssert(OrigStep);
203    llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
204    if (isStepOptSupported(AllocType)) {
205      llvm::Type *ET = PT->getElementType();
206      uint64_t ETSize = DL->getTypeAllocSize(ET);
207      llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
208      return llvm::ConstantInt::get(Int32Ty, ETSize);
209    } else {
210      return OrigStep;
211    }
212  }
213
214  /// Builds the types required by the pass for the given context.
215  void buildTypes(void) {
216    // Create the RsLaunchDimensionsTy and RsExpandKernelDriverInfoPfxTy structs.
217
218    llvm::Type *Int8Ty                   = llvm::Type::getInt8Ty(*Context);
219    llvm::Type *Int8PtrTy                = Int8Ty->getPointerTo();
220    llvm::Type *Int8PtrArrayInputLimitTy = llvm::ArrayType::get(Int8PtrTy, RS_KERNEL_INPUT_LIMIT);
221    llvm::Type *Int32Ty                  = llvm::Type::getInt32Ty(*Context);
222    llvm::Type *Int32ArrayInputLimitTy   = llvm::ArrayType::get(Int32Ty, RS_KERNEL_INPUT_LIMIT);
223    llvm::Type *VoidPtrTy                = llvm::Type::getInt8PtrTy(*Context);
224    llvm::Type *Int32Array4Ty            = llvm::ArrayType::get(Int32Ty, 4);
225
226    /* Defined in frameworks/base/libs/rs/cpu_ref/rsCpuCore.h:
227     *
228     * struct RsLaunchDimensions {
229     *   uint32_t x;
230     *   uint32_t y;
231     *   uint32_t z;
232     *   uint32_t lod;
233     *   uint32_t face;
234     *   uint32_t array[4];
235     * };
236     */
237    llvm::SmallVector<llvm::Type*, RsLaunchDimensionsFieldCount> RsLaunchDimensionsTypes;
238    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t x
239    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t y
240    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t z
241    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t lod
242    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t face
243    RsLaunchDimensionsTypes.push_back(Int32Array4Ty); // uint32_t array[4]
244    llvm::StructType *RsLaunchDimensionsTy =
245        llvm::StructType::create(RsLaunchDimensionsTypes, "RsLaunchDimensions");
246
247    /* Defined as the beginning of RsExpandKernelDriverInfo in frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h:
248     *
249     * struct RsExpandKernelDriverInfoPfx {
250     *     const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT];
251     *     uint32_t inStride[RS_KERNEL_INPUT_LIMIT];
252     *     uint32_t inLen;
253     *
254     *     uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT];
255     *     uint32_t outStride[RS_KERNEL_INPUT_LIMIT];
256     *     uint32_t outLen;
257     *
258     *     // Dimension of the launch
259     *     RsLaunchDimensions dim;
260     *
261     *     // The walking iterator of the launch
262     *     RsLaunchDimensions current;
263     *
264     *     const void *usr;
265     *     uint32_t usrLen;
266     *
267     *     // Items below this line are not used by the compiler and can be change in the driver.
268     *     // So the compiler must assume there are an unknown number of fields of unknown type
269     *     // beginning here.
270     * };
271     *
272     * The name "RsExpandKernelDriverInfoPfx" is known to RSInvariantPass (RSInvariant.cpp).
273     */
274    llvm::SmallVector<llvm::Type*, RsExpandKernelDriverInfoPfxFieldCount> RsExpandKernelDriverInfoPfxTypes;
275    RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT]
276    RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy);   // uint32_t inStride[RS_KERNEL_INPUT_LIMIT]
277    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t inLen
278    RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT]
279    RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy);   // uint32_t outStride[RS_KERNEL_INPUT_LIMIT]
280    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t outLen
281    RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy);     // RsLaunchDimensions dim
282    RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy);     // RsLaunchDimensions current
283    RsExpandKernelDriverInfoPfxTypes.push_back(VoidPtrTy);                // const void *usr
284    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t usrLen
285    llvm::StructType *RsExpandKernelDriverInfoPfxTy =
286        llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx");
287
288    // Create the function type for expanded kernels.
289
290    llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo();
291
292    llvm::SmallVector<llvm::Type*, 8> ParamTypes;
293    ParamTypes.push_back(RsExpandKernelDriverInfoPfxPtrTy); // const RsExpandKernelDriverInfoPfx *p
294    ParamTypes.push_back(Int32Ty);                          // uint32_t x1
295    ParamTypes.push_back(Int32Ty);                          // uint32_t x2
296    ParamTypes.push_back(Int32Ty);                          // uint32_t outstep
297
298    ExpandedFunctionType =
299        llvm::FunctionType::get(llvm::Type::getVoidTy(*Context), ParamTypes,
300                                false);
301  }
302
303  /// @brief Create skeleton of the expanded function.
304  ///
305  /// This creates a function with the following signature:
306  ///
307  ///   void (const RsForEachStubParamStruct *p, uint32_t x1, uint32_t x2,
308  ///         uint32_t outstep)
309  ///
310  llvm::Function *createEmptyExpandedFunction(llvm::StringRef OldName) {
311    llvm::Function *ExpandedFunction =
312      llvm::Function::Create(ExpandedFunctionType,
313                             llvm::GlobalValue::ExternalLinkage,
314                             OldName + ".expand", Module);
315
316    bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
317
318    llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
319
320    (AI++)->setName("p");
321    (AI++)->setName("x1");
322    (AI++)->setName("x2");
323    (AI++)->setName("arg_outstep");
324
325    llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
326                                                       ExpandedFunction);
327    llvm::IRBuilder<> Builder(Begin);
328    Builder.CreateRetVoid();
329
330    return ExpandedFunction;
331  }
332
333  /// @brief Create an empty loop
334  ///
335  /// Create a loop of the form:
336  ///
337  /// for (i = LowerBound; i < UpperBound; i++)
338  ///   ;
339  ///
340  /// After the loop has been created, the builder is set such that
341  /// instructions can be added to the loop body.
342  ///
343  /// @param Builder The builder to use to build this loop. The current
344  ///                position of the builder is the position the loop
345  ///                will be inserted.
346  /// @param LowerBound The first value of the loop iterator
347  /// @param UpperBound The maximal value of the loop iterator
348  /// @param LoopIV A reference that will be set to the loop iterator.
349  /// @return The BasicBlock that will be executed after the loop.
350  llvm::BasicBlock *createLoop(llvm::IRBuilder<> &Builder,
351                               llvm::Value *LowerBound,
352                               llvm::Value *UpperBound,
353                               llvm::PHINode **LoopIV) {
354    assert(LowerBound->getType() == UpperBound->getType());
355
356    llvm::BasicBlock *CondBB, *AfterBB, *HeaderBB;
357    llvm::Value *Cond, *IVNext;
358    llvm::PHINode *IV;
359
360    CondBB = Builder.GetInsertBlock();
361    // DT = &getAnalysis<DominatorTree>();
362    // LI = &getAnalysis<LoopInfo>();
363    AfterBB = llvm::SplitBlock(CondBB, Builder.GetInsertPoint(), nullptr, nullptr);
364    HeaderBB = llvm::BasicBlock::Create(*Context, "Loop", CondBB->getParent());
365
366    // if (LowerBound < Upperbound)
367    //   goto LoopHeader
368    // else
369    //   goto AfterBB
370    CondBB->getTerminator()->eraseFromParent();
371    Builder.SetInsertPoint(CondBB);
372    Cond = Builder.CreateICmpULT(LowerBound, UpperBound);
373    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
374
375    // iv = PHI [CondBB -> LowerBound], [LoopHeader -> NextIV ]
376    // iv.next = iv + 1
377    // if (iv.next < Upperbound)
378    //   goto LoopHeader
379    // else
380    //   goto AfterBB
381    Builder.SetInsertPoint(HeaderBB);
382    IV = Builder.CreatePHI(LowerBound->getType(), 2, "X");
383    IV->addIncoming(LowerBound, CondBB);
384    IVNext = Builder.CreateNUWAdd(IV, Builder.getInt32(1));
385    IV->addIncoming(IVNext, HeaderBB);
386    Cond = Builder.CreateICmpULT(IVNext, UpperBound);
387    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
388    AfterBB->setName("Exit");
389    Builder.SetInsertPoint(HeaderBB->getFirstNonPHI());
390    *LoopIV = IV;
391    return AfterBB;
392  }
393
394public:
395  RSForEachExpandPass(bool pEnableStepOpt = true)
396      : ModulePass(ID), Module(nullptr), Context(nullptr),
397        mEnableStepOpt(pEnableStepOpt) {
398
399  }
400
401  virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
402    // This pass does not use any other analysis passes, but it does
403    // add/wrap the existing functions in the module (thus altering the CFG).
404  }
405
406  // Build contribution to outgoing argument list for calling a
407  // ForEach-able function, based on the special parameters of that
408  // function.
409  //
410  // Signature - metadata bits for the signature of the ForEach-able function
411  // X, Arg_p - values derived directly from expanded function,
412  //            suitable for computing arguments for the ForEach-able function
413  // CalleeArgs - contribution is accumulated here
414  // Bump - invoked once for each contributed outgoing argument
415  void ExpandSpecialArguments(uint32_t Signature,
416                              llvm::Value *X,
417                              llvm::Value *Arg_p,
418                              llvm::IRBuilder<> &Builder,
419                              llvm::SmallVector<llvm::Value*, 8> &CalleeArgs,
420                              std::function<void ()> Bump) {
421
422    if (bcinfo::MetadataExtractor::hasForEachSignatureCtxt(Signature)) {
423      CalleeArgs.push_back(Arg_p);
424      Bump();
425    }
426
427    if (bcinfo::MetadataExtractor::hasForEachSignatureX(Signature)) {
428      CalleeArgs.push_back(X);
429      Bump();
430    }
431
432    if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature) ||
433        bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
434
435      llvm::Value *Current = Builder.CreateStructGEP(Arg_p, RsExpandKernelDriverInfoPfxFieldCurrent);
436
437      if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature)) {
438        llvm::Value *Y = Builder.CreateLoad(
439            Builder.CreateStructGEP(Current, RsLaunchDimensionsFieldY), "Y");
440        CalleeArgs.push_back(Y);
441        Bump();
442      }
443
444      if (bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
445        llvm::Value *Z = Builder.CreateLoad(
446            Builder.CreateStructGEP(Current, RsLaunchDimensionsFieldZ), "Z");
447        CalleeArgs.push_back(Z);
448        Bump();
449      }
450    }
451  }
452
453  /* Performs the actual optimization on a selected function. On success, the
454   * Module will contain a new function of the name "<NAME>.expand" that
455   * invokes <NAME>() in a loop with the appropriate parameters.
456   */
  // Function  - the ForEach-able function to wrap; its in/out/usrData
  //             parameters are passed as pointers.
  // Signature - ForEach signature bits; when 0, the signature is derived with
  //             getRootSignature().
  //
  // Returns false (without creating anything) if no signature can be
  // determined, true otherwise. Only slot 0 of the driver's input and output
  // pointer arrays is used here.
457  bool ExpandFunction(llvm::Function *Function, uint32_t Signature) {
458    ALOGV("Expanding ForEach-able Function %s",
459          Function->getName().str().c_str());
460
461    if (!Signature) {
462      Signature = getRootSignature(Function);
463      if (!Signature) {
464        // We couldn't determine how to expand this function based on its
465        // function signature.
466        return false;
467      }
468    }
469
470    llvm::DataLayout DL(Module);
471
472    llvm::Function *ExpandedFunction =
473      createEmptyExpandedFunction(Function->getName());
474
475    /*
476     * Extract the expanded function's parameters.  It is guaranteed by
477     * createEmptyExpandedFunction that there will be
     * NUM_EXPANDED_FUNCTION_PARAMS (four) parameters.
478     */
479
480    bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
481
482    llvm::Function::arg_iterator ExpandedFunctionArgIter =
483      ExpandedFunction->arg_begin();
484
485    llvm::Value *Arg_p       = &*(ExpandedFunctionArgIter++);
486    llvm::Value *Arg_x1      = &*(ExpandedFunctionArgIter++);
487    llvm::Value *Arg_x2      = &*(ExpandedFunctionArgIter++);
488    llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);
489
490    llvm::Value *InStep  = nullptr;
491    llvm::Value *OutStep = nullptr;
492
493    // Construct the actual function body.
    // The builder starts in front of the `ret void` that
    // createEmptyExpandedFunction placed in the entry block.
494    llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
495
496    // Collect and construct the arguments for the kernel().
497    // Note that we load any loop-invariant arguments before entering the Loop.
    // FunctionArgIter walks the callee's parameters in the order
    // in, out, usrData, special arguments; each present one advances it.
498    llvm::Function::arg_iterator FunctionArgIter = Function->arg_begin();
499
500    llvm::Type  *InTy      = nullptr;
501    llvm::Value *InBasePtr = nullptr;
502    if (bcinfo::MetadataExtractor::hasForEachSignatureIn(Signature)) {
503      llvm::Value *InsBasePtr  = Builder.CreateStructGEP(Arg_p, RsExpandKernelDriverInfoPfxFieldInPtr, "inputs_base");
504
505      llvm::Value *InStepsBase = Builder.CreateStructGEP(Arg_p, RsExpandKernelDriverInfoPfxFieldInStride, "insteps_base");
506
      // Input stride: p->inStride[0].
507      llvm::Value    *InStepAddr = Builder.CreateConstInBoundsGEP2_32(InStepsBase, 0, 0);
508      llvm::LoadInst *InStepArg  = Builder.CreateLoad(InStepAddr,
509                                                      "instep_addr");
510
511      InTy = (FunctionArgIter++)->getType();
512      InStep = getStepValue(&DL, InTy, InStepArg);
513
514      InStep->setName("instep");
515
      // Input base pointer: p->inPtr[0].
516      llvm::Value *InputAddr = Builder.CreateConstInBoundsGEP2_32(InsBasePtr, 0, 0);
517      InBasePtr = Builder.CreateLoad(InputAddr, "input_base");
518    }
519
520    llvm::Type *OutTy = nullptr;
521    llvm::Value *OutBasePtr = nullptr;
522    if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
523      OutTy = (FunctionArgIter++)->getType();
      // The output stride comes from the expanded function's own outstep
      // parameter; the base pointer is p->outPtr[0].
524      OutStep = getStepValue(&DL, OutTy, Arg_outstep);
525      OutStep->setName("outstep");
526      OutBasePtr = Builder.CreateLoad(
527                     Builder.CreateConstInBoundsGEP2_32(
528                         Builder.CreateStructGEP(Arg_p, RsExpandKernelDriverInfoPfxFieldOutPtr), 0, 0));
529    }
530
531    llvm::Value *UsrData = nullptr;
532    if (bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)) {
533      llvm::Type *UsrDataTy = (FunctionArgIter++)->getType();
      // p->usr, cast to the callee's declared usrData parameter type.
534      UsrData = Builder.CreatePointerCast(Builder.CreateLoad(
535          Builder.CreateStructGEP(Arg_p,  RsExpandKernelDriverInfoPfxFieldUsr)), UsrDataTy);
536      UsrData->setName("UsrData");
537    }
538
    // Loop over [x1, x2). From here on the builder inserts into the loop
    // block, so everything emitted below runs once per iteration.
539    llvm::PHINode *IV;
540    createLoop(Builder, Arg_x1, Arg_x2, &IV);
541
542    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
543    ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
544                           [&FunctionArgIter]() { FunctionArgIter++; });
545
    // Every parameter of the callee must have been accounted for by now.
546    bccAssert(FunctionArgIter == Function->arg_end());
547
548    // Populate the actual call to kernel().
549    llvm::SmallVector<llvm::Value*, 8> RootArgs;
550
551    llvm::Value *InPtr  = nullptr;
552    llvm::Value *OutPtr = nullptr;
553
554    // Calculate the current input and output pointers
555    //
556    // We always calculate the input/output pointers with a GEP operating on i8
557    // values and only cast at the very end to OutTy. This is because the step
558    // between two values is given in bytes.
559    //
560    // TODO: We could further optimize the output by using a GEP operation of
561    // type 'OutTy' in cases where the element type of the allocation allows.
    //
    // In both cases: ptr = base + (IV - x1) * step.
562    if (OutBasePtr) {
563      llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
564      OutOffset = Builder.CreateMul(OutOffset, OutStep);
565      OutPtr = Builder.CreateGEP(OutBasePtr, OutOffset);
566      OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
567    }
568
569    if (InBasePtr) {
570      llvm::Value *InOffset = Builder.CreateSub(IV, Arg_x1);
571      InOffset = Builder.CreateMul(InOffset, InStep);
572      InPtr = Builder.CreateGEP(InBasePtr, InOffset);
573      InPtr = Builder.CreatePointerCast(InPtr, InTy);
574    }
575
    // Argument order of the call: in, out, usrData, then special arguments.
576    if (InPtr) {
577      RootArgs.push_back(InPtr);
578    }
579
580    if (OutPtr) {
581      RootArgs.push_back(OutPtr);
582    }
583
584    if (UsrData) {
585      RootArgs.push_back(UsrData);
586    }
587
588    RootArgs.append(CalleeArgs.begin(), CalleeArgs.end());
589
    // The call's result (if any) is not used.
590    Builder.CreateCall(Function, RootArgs);
591
592    return true;
593  }
594
595  /* Expand a pass-by-value kernel.
596   */
  // Creates "<NAME>.expand", which calls Function once per x in [x1, x2).
  //
  // Function  - the kernel; inputs are taken by value (or, for promoted
  //             structs, by pointer) and the output is either the return
  //             value or, for a void kernel, a leading pointer parameter.
  // Signature - ForEach signature bits; must have the kernel bit set and must
  //             not have the usrData bit set (both asserted).
  //
  // Always returns true.
597  bool ExpandKernel(llvm::Function *Function, uint32_t Signature) {
598    bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature));
599    ALOGV("Expanding kernel Function %s", Function->getName().str().c_str());
600
601    // TODO: Refactor this to share functionality with ExpandFunction.
602    llvm::DataLayout DL(Module);
603
604    llvm::Function *ExpandedFunction =
605      createEmptyExpandedFunction(Function->getName());
606
607    /*
608     * Extract the expanded function's parameters.  It is guaranteed by
609     * createEmptyExpandedFunction that there will be
     * NUM_EXPANDED_FUNCTION_PARAMS (four) parameters.
610     */
611
612    bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
613
614    llvm::Function::arg_iterator ExpandedFunctionArgIter =
615      ExpandedFunction->arg_begin();
616
617    llvm::Value *Arg_p       = &*(ExpandedFunctionArgIter++);
618    llvm::Value *Arg_x1      = &*(ExpandedFunctionArgIter++);
619    llvm::Value *Arg_x2      = &*(ExpandedFunctionArgIter++);
620    llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);
621
622    // Construct the actual function body.
    // The builder starts in front of the `ret void` that
    // createEmptyExpandedFunction placed in the entry block.
623    llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
624
625    // Create TBAA meta-data.
    // Two access tags are built under a RenderScript-specific root:
    // "allocation" for loads/stores of allocation cells and "pointer" for
    // loads of the base pointers out of the driver info struct.
626    llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript,
627                 *TBAAAllocation, *TBAAPointer;
628    llvm::MDBuilder MDHelper(*Context);
629
630    TBAARenderScriptDistinct =
631      MDHelper.createTBAARoot("RenderScript Distinct TBAA");
632    TBAARenderScript = MDHelper.createTBAANode("RenderScript TBAA",
633        TBAARenderScriptDistinct);
634    TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
635                                                       TBAARenderScript);
636    TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
637                                                      TBAAAllocation, 0);
638    TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer",
639                                                    TBAARenderScript);
640    TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);
641
    // A single alias scope that is attached ("alias.scope") to every memory
    // access emitted below.
642    llvm::MDNode *AliasingDomain, *AliasingScope;
643    AliasingDomain = MDHelper.createAnonymousAliasScopeDomain("RS argument scope domain");
644    AliasingScope = MDHelper.createAnonymousAliasScope(AliasingDomain, "RS argument scope");
645
646    /*
647     * Collect and construct the arguments for the kernel().
648     *
649     * Note that we load any loop-invariant arguments before entering the Loop.
650     */
    // NOTE(review): only the output base pointer is loaded before
    // createLoop(); the input stride/base-pointer loads further down are
    // emitted after createLoop() and therefore land inside the loop block.
    //
    // NumInputs starts as the callee's parameter count and is decremented for
    // the by-pointer output and for each special argument, leaving the number
    // of data inputs.
651    size_t NumInputs = Function->arg_size();
652
653    // No usrData parameter on kernels.
654    bccAssert(
655        !bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature));
656
657    llvm::Function::arg_iterator ArgIter = Function->arg_begin();
658
659    // Check the return type
660    llvm::Type     *OutTy            = nullptr;
661    llvm::Value    *OutStep          = nullptr;
662    llvm::LoadInst *OutBasePtr       = nullptr;
663    llvm::Value    *CastedOutBasePtr = nullptr;
664
665    bool PassOutByPointer = false;
666
667    if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
668      llvm::Type *OutBaseTy = Function->getReturnType();
669
670      if (OutBaseTy->isVoidTy()) {
        // Void kernel with an output: the first parameter is the out pointer.
671        PassOutByPointer = true;
672        OutTy = ArgIter->getType();
673
674        ArgIter++;
675        --NumInputs;
676      } else {
677        // We don't increment Args, since we are using the actual return type.
678        OutTy = OutBaseTy->getPointerTo();
679      }
680
      // NOTE(review): OutStep is computed and named but not used in the
      // address computation below, which indexes the typed pointer directly.
681      OutStep = getStepValue(&DL, OutTy, Arg_outstep);
682      OutStep->setName("outstep");
      // Output base pointer: p->outPtr[0].
683      OutBasePtr = Builder.CreateLoad(
684                     Builder.CreateConstInBoundsGEP2_32(
685                         Builder.CreateStructGEP(Arg_p, RsExpandKernelDriverInfoPfxFieldOutPtr), 0, 0));
686
687      if (gEnableRsTbaa) {
688        OutBasePtr->setMetadata("tbaa", TBAAPointer);
689      }
690
691      OutBasePtr->setMetadata("alias.scope", AliasingScope);
692
693      CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out");
694    }
695
    // Loop over [x1, x2). From here on the builder inserts into the loop
    // block.
696    llvm::PHINode *IV;
697    createLoop(Builder, Arg_x1, Arg_x2, &IV);
698
699    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
700    ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
701                           [&NumInputs]() { --NumInputs; });
702
    // Per-input bookkeeping, indexed by input number.
    // NOTE(review): InTypes and InSteps are filled but not read again within
    // this function.
703    llvm::SmallVector<llvm::Type*,  8> InTypes;
704    llvm::SmallVector<llvm::Value*, 8> InSteps;
705    llvm::SmallVector<llvm::Value*, 8> InBasePtrs;
706    llvm::SmallVector<bool,         8> InIsStructPointer;
707
708    bccAssert(NumInputs <= RS_KERNEL_INPUT_LIMIT);
709
710    if (NumInputs > 0) {
711      llvm::Value *InsBasePtr  = Builder.CreateStructGEP(Arg_p, RsExpandKernelDriverInfoPfxFieldInPtr, "inputs_base");
712
713      llvm::Value *InStepsBase = Builder.CreateStructGEP(Arg_p, RsExpandKernelDriverInfoPfxFieldInStride, "insteps_base");
714
715      for (size_t InputIndex = 0; InputIndex < NumInputs;
716           ++InputIndex, ArgIter++) {
717
          // Input stride: p->inStride[InputIndex].
718          llvm::Value    *InStepAddr = Builder.CreateConstInBoundsGEP2_32(InStepsBase, 0, InputIndex);
719          llvm::LoadInst *InStepArg  = Builder.CreateLoad(InStepAddr,
720                                                          "instep_addr");
721
722          llvm::Type *InType = ArgIter->getType();
723
724        /*
725         * AArch64 calling conventions dictate that structs of sufficient size
         * get passed by
726         * pointer instead of passed by value.  This, combined with the fact
727         * that we don't allow kernels to operate on pointer data means that if
728         * we see a kernel with a pointer parameter we know that it is struct
729         * input that has been promoted.  As such we don't need to convert its
730         * type to a pointer.  Later we will need to know to avoid a load, so we
731         * save this information in InIsStructPointer.
732         */
733          if (!InType->isPointerTy()) {
734            InType = InType->getPointerTo();
735            InIsStructPointer.push_back(false);
736          } else {
737            InIsStructPointer.push_back(true);
738          }
739
740          llvm::Value *InStep = getStepValue(&DL, InType, InStepArg);
741
742          InStep->setName("instep");
743
          // Input base pointer: p->inPtr[InputIndex], cast to a pointer to the
          // input's element type.
744          llvm::Value    *InputAddr = Builder.CreateConstInBoundsGEP2_32(InsBasePtr, 0, InputIndex);
745          llvm::LoadInst *InBasePtr = Builder.CreateLoad(InputAddr,
746                                                         "input_base");
747          llvm::Value    *CastInBasePtr = Builder.CreatePointerCast(InBasePtr,
748                                                                    InType, "casted_in");
749          if (gEnableRsTbaa) {
750            InBasePtr->setMetadata("tbaa", TBAAPointer);
751          }
752
753          InBasePtr->setMetadata("alias.scope", AliasingScope);
754
755          InTypes.push_back(InType);
756          InSteps.push_back(InStep);
757          InBasePtrs.push_back(CastInBasePtr);
758      }
759    }
760
761    // Populate the actual call to kernel().
762    llvm::SmallVector<llvm::Value*, 8> RootArgs;
763
764    // Calculate the current input and output pointers
765    //
766    //
767    // We always calculate the input/output pointers with a GEP operating on i8
768    // values combined with a multiplication and only cast at the very end to
769    // OutTy.  This is to account for dynamic stepping sizes when the value
770    // isn't apparent at compile time.  In the (very common) case when we know
771    // the step size at compile time, due to having complete type information
772    // this multiplication will be optimized out and produces code equivalent
773    // to a GEP on a pointer of the correct type.
    //
    // NOTE(review): the code below does not match the description above: it
    // indexes the already-cast typed base pointers by (IV - x1) and emits no
    // multiplication by a step value.
774
775    // Output
776
777    llvm::Value *OutPtr = nullptr;
778    if (CastedOutBasePtr) {
779      llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
780
781      OutPtr    = Builder.CreateGEP(CastedOutBasePtr, OutOffset);
782
783      if (PassOutByPointer) {
784        RootArgs.push_back(OutPtr);
785      }
786    }
787
788    // Inputs
789
790    if (NumInputs > 0) {
791      llvm::Value *Offset = Builder.CreateSub(IV, Arg_x1);
792
793      for (size_t Index = 0; Index < NumInputs; ++Index) {
794        llvm::Value *InPtr    = Builder.CreateGEP(InBasePtrs[Index], Offset);
795        llvm::Value *Input;
796
797        if (InIsStructPointer[Index]) {
          // Promoted struct: the kernel takes the cell's address as-is.
798          Input = InPtr;
799
800        } else {
          // By-value input: load the current cell.
801          llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input");
802
803          if (gEnableRsTbaa) {
804            InputLoad->setMetadata("tbaa", TBAAAllocation);
805          }
806
807          InputLoad->setMetadata("alias.scope", AliasingScope);
808
809          Input = InputLoad;
810        }
811
812        RootArgs.push_back(Input);
813      }
814    }
815
    // Argument order of the call: [out pointer], inputs, special arguments.
816    RootArgs.append(CalleeArgs.begin(), CalleeArgs.end());
817
818    llvm::Value *RetVal = Builder.CreateCall(Function, RootArgs);
819
    // A kernel that returns its result has it stored into the output cell.
820    if (OutPtr && !PassOutByPointer) {
821      llvm::StoreInst *Store = Builder.CreateStore(RetVal, OutPtr);
822      if (gEnableRsTbaa) {
823        Store->setMetadata("tbaa", TBAAAllocation);
824      }
825      Store->setMetadata("alias.scope", AliasingScope);
826    }
827
828    return true;
829  }
830
831  /// @brief Checks if pointers to allocation internals are exposed
832  ///
833  /// This function verifies if through the parameters passed to the kernel
834  /// or through calls to the runtime library the script gains access to
835  /// pointers pointing to data within a RenderScript Allocation.
836  /// If we know we control all loads from and stores to data within
837  /// RenderScript allocations and if we know the run-time internal accesses
838  /// are all annotated with RenderScript TBAA metadata, only then we
839  /// can safely use TBAA to distinguish between generic and from-allocation
840  /// pointers.
841  bool allocPointersExposed(llvm::Module &Module) {
842    // Old style kernel function can expose pointers to elements within
843    // allocations.
844    // TODO: Extend analysis to allow simple cases of old-style kernels.
845    for (size_t i = 0; i < mExportForEachCount; ++i) {
846      const char *Name = mExportForEachNameList[i];
847      uint32_t Signature = mExportForEachSignatureList[i];
848      if (Module.getFunction(Name) &&
849          !bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature)) {
850        return true;
851      }
852    }
853
854    // Check for library functions that expose a pointer to an Allocation or
855    // that are not yet annotated with RenderScript-specific tbaa information.
856    static std::vector<std::string> Funcs;
857
858    // rsGetElementAt(...)
859    Funcs.push_back("_Z14rsGetElementAt13rs_allocationj");
860    Funcs.push_back("_Z14rsGetElementAt13rs_allocationjj");
861    Funcs.push_back("_Z14rsGetElementAt13rs_allocationjjj");
862    // rsSetElementAt()
863    Funcs.push_back("_Z14rsSetElementAt13rs_allocationPvj");
864    Funcs.push_back("_Z14rsSetElementAt13rs_allocationPvjj");
865    Funcs.push_back("_Z14rsSetElementAt13rs_allocationPvjjj");
866    // rsGetElementAtYuv_uchar_Y()
867    Funcs.push_back("_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj");
868    // rsGetElementAtYuv_uchar_U()
869    Funcs.push_back("_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj");
870    // rsGetElementAtYuv_uchar_V()
871    Funcs.push_back("_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj");
872
873    for (std::vector<std::string>::iterator FI = Funcs.begin(),
874                                            FE = Funcs.end();
875         FI != FE; ++FI) {
876      llvm::Function *Function = Module.getFunction(*FI);
877
878      if (!Function) {
879        ALOGE("Missing run-time function '%s'", FI->c_str());
880        return true;
881      }
882
883      if (Function->getNumUses() > 0) {
884        return true;
885      }
886    }
887
888    return false;
889  }
890
  /// @brief Connect RenderScript TBAA metadata to C/C++ metadata
  ///
  /// The TBAA metadata used to annotate loads/stores from RenderScript
  /// Allocations is generated in a separate TBAA tree with a
  /// "RenderScript Distinct TBAA" root node. LLVM assumes may-alias for
  /// all nodes in unrelated alias analysis trees. This function makes the
  /// "RenderScript TBAA" node (which is parented by the Distinct TBAA root)
  /// a subtree of the normal C/C++ TBAA tree, alongside the normal C/C++
  /// types. With the trees connected, every access to an Allocation is
  /// resolved to no-alias when compared to a normal C/C++ access.
901  void connectRenderScriptTBAAMetadata(llvm::Module &Module) {
902    llvm::MDBuilder MDHelper(*Context);
903    llvm::MDNode *TBAARenderScriptDistinct =
904      MDHelper.createTBAARoot("RenderScript Distinct TBAA");
905    llvm::MDNode *TBAARenderScript = MDHelper.createTBAANode(
906        "RenderScript TBAA", TBAARenderScriptDistinct);
907    llvm::MDNode *TBAARoot     = MDHelper.createTBAARoot("Simple C/C++ TBAA");
908    TBAARenderScript->replaceOperandWith(1, TBAARoot);
909  }
910
911  virtual bool runOnModule(llvm::Module &Module) {
912    bool Changed  = false;
913    this->Module  = &Module;
914    this->Context = &Module.getContext();
915
916    this->buildTypes();
917
918    bcinfo::MetadataExtractor me(&Module);
919    if (!me.extract()) {
920      ALOGE("Could not extract metadata from module!");
921      return false;
922    }
923    mExportForEachCount = me.getExportForEachSignatureCount();
924    mExportForEachNameList = me.getExportForEachNameList();
925    mExportForEachSignatureList = me.getExportForEachSignatureList();
926
927    bool AllocsExposed = allocPointersExposed(Module);
928
929    for (size_t i = 0; i < mExportForEachCount; ++i) {
930      const char *name = mExportForEachNameList[i];
931      uint32_t signature = mExportForEachSignatureList[i];
932      llvm::Function *kernel = Module.getFunction(name);
933      if (kernel) {
934        if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) {
935          Changed |= ExpandKernel(kernel, signature);
936          kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
937        } else if (kernel->getReturnType()->isVoidTy()) {
938          Changed |= ExpandFunction(kernel, signature);
939          kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
940        } else {
941          // There are some graphics root functions that are not
942          // expanded, but that will be called directly. For those
943          // functions, we can not set the linkage to internal.
944        }
945      }
946    }
947
948    if (gEnableRsTbaa && !AllocsExposed) {
949      connectRenderScriptTBAAMetadata(Module);
950    }
951
952    return Changed;
953  }
954
955  virtual const char *getPassName() const {
956    return "ForEach-able Function Expansion";
957  }
958
959}; // end RSForEachExpandPass
960
961} // end anonymous namespace
962
963char RSForEachExpandPass::ID = 0;
964static llvm::RegisterPass<RSForEachExpandPass> X("foreachexp", "ForEach Expand Pass");
965
966namespace bcc {
967
968llvm::ModulePass *
969createRSForEachExpandPass(bool pEnableStepOpt){
970  return new RSForEachExpandPass(pEnableStepOpt);
971}
972
973} // end namespace bcc
974