RSForEachExpand.cpp revision bc656681ba7ce2a6eae3aded74f632fa2bd103d0
1/*
2 * Copyright 2012, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "bcc/Assert.h"
18#include "bcc/Renderscript/RSTransforms.h"
19
20#include <cstdlib>
21#include <functional>
22
23#include <llvm/IR/DerivedTypes.h>
24#include <llvm/IR/Function.h>
25#include <llvm/IR/Instructions.h>
26#include <llvm/IR/IRBuilder.h>
27#include <llvm/IR/MDBuilder.h>
28#include <llvm/IR/Module.h>
29#include <llvm/Pass.h>
30#include <llvm/Support/raw_ostream.h>
31#include <llvm/IR/DataLayout.h>
32#include <llvm/IR/Function.h>
33#include <llvm/IR/Type.h>
34#include <llvm/Transforms/Utils/BasicBlockUtils.h>
35
36#include "bcc/Config/Config.h"
37#include "bcc/Support/Log.h"
38
39#include "bcinfo/MetadataExtractor.h"
40
// Number of formal parameters of every "<NAME>.expand" function
// (p, x1, x2, outstep). createEmptyExpandedFunction(), ExpandFunction() and
// ExpandKernel() assert the created function's arg_size() against it.
41#define NUM_EXPANDED_FUNCTION_PARAMS 4
42
43using namespace bcc;
44
45namespace {
46
// When true, ExpandKernel() attaches "tbaa" metadata to the loads and stores
// it generates for kernel inputs and outputs.
static constexpr bool gEnableRsTbaa = true;
48
49/* RSForEachExpandPass - This pass operates on functions that are able to be
50 * called via rsForEach() or "foreach_<NAME>". We create an inner loop for the
51 * ForEach-able function to be invoked over the appropriate data cells of the
52 * input/output allocations (adjusting other relevant parameters as we go). We
53 * support doing this for any ForEach-able compute kernels. The new function
54 * name is the original function name followed by ".expand". Note that we
55 * still generate code for the original function.
56 */
57class RSForEachExpandPass : public llvm::ModulePass {
58public:
59  static char ID;
60
61private:
// Length of the inPtr/inStride/outPtr/outStride arrays in the struct type
// assembled by buildTypes(), and the upper bound ExpandKernel() asserts on the
// number of kernel inputs.
62  static const size_t RS_KERNEL_INPUT_LIMIT = 8; // see frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h
63
// Field indices of the "RsLaunchDimensions" struct type assembled by
// buildTypes(). They are used as struct-GEP indices, so the numbering has to
// follow the field order of that struct.
enum RsLaunchDimensionsField {
  RsLaunchDimensionsFieldX     = 0,
  RsLaunchDimensionsFieldY     = 1,
  RsLaunchDimensionsFieldZ     = 2,
  RsLaunchDimensionsFieldLod   = 3,
  RsLaunchDimensionsFieldFace  = 4,
  RsLaunchDimensionsFieldArray = 5,

  // Number of fields; not a field itself.
  RsLaunchDimensionsFieldCount = 6
};
74
// Field indices of the "RsExpandKernelDriverInfoPfx" struct type assembled by
// buildTypes(). They are used as struct-GEP indices, so the numbering has to
// follow the field order of that struct.
enum RsExpandKernelDriverInfoPfxField {
  RsExpandKernelDriverInfoPfxFieldInPtr,
  RsExpandKernelDriverInfoPfxFieldInStride,
  RsExpandKernelDriverInfoPfxFieldInLen,
  RsExpandKernelDriverInfoPfxFieldOutPtr,
  RsExpandKernelDriverInfoPfxFieldOutStride,
  RsExpandKernelDriverInfoPfxFieldOutLen,
  RsExpandKernelDriverInfoPfxFieldDim,
  RsExpandKernelDriverInfoPfxFieldCurrent,
  RsExpandKernelDriverInfoPfxFieldUsr,
  RsExpandKernelDriverInfoPfxFieldUsrLen,

  // Historical misspelling of ...FieldUsrLen. Kept as an alias with the same
  // value so that any existing user of the old name keeps compiling.
  RsExpandKernelDriverInfoPfxFieldUsLenr =
      RsExpandKernelDriverInfoPfxFieldUsrLen,

  // Number of fields; not a field itself. Follows the alias, so it is still
  // one past ...FieldUsrLen.
  RsExpandKernelDriverInfoPfxFieldCount
};
89
90  llvm::Module *Module;
91  llvm::LLVMContext *Context;
92
93  /*
94   * Pointer to LLVM type information for the function signature
95   * for expanded kernels.  This must be re-calculated for each
96   * module the pass is run on.
97   */
98  llvm::FunctionType *ExpandedFunctionType;
99
100  uint32_t mExportForEachCount;
101  const char **mExportForEachNameList;
102  const uint32_t *mExportForEachSignatureList;
103
104  // Turns on optimization of allocation stride values.
105  bool mEnableStepOpt;
106
107  uint32_t getRootSignature(llvm::Function *Function) {
108    const llvm::NamedMDNode *ExportForEachMetadata =
109        Module->getNamedMetadata("#rs_export_foreach");
110
111    if (!ExportForEachMetadata) {
112      llvm::SmallVector<llvm::Type*, 8> RootArgTys;
113      for (llvm::Function::arg_iterator B = Function->arg_begin(),
114                                        E = Function->arg_end();
115           B != E;
116           ++B) {
117        RootArgTys.push_back(B->getType());
118      }
119
120      // For pre-ICS bitcode, we may not have signature information. In that
121      // case, we use the size of the RootArgTys to select the number of
122      // arguments.
123      return (1 << RootArgTys.size()) - 1;
124    }
125
126    if (ExportForEachMetadata->getNumOperands() == 0) {
127      return 0;
128    }
129
130    bccAssert(ExportForEachMetadata->getNumOperands() > 0);
131
132    // We only handle the case for legacy root() functions here, so this is
133    // hard-coded to look at only the first such function.
134    llvm::MDNode *SigNode = ExportForEachMetadata->getOperand(0);
135    if (SigNode != nullptr && SigNode->getNumOperands() == 1) {
136      llvm::Metadata *SigMD = SigNode->getOperand(0);
137      if (llvm::MDString *SigS = llvm::dyn_cast<llvm::MDString>(SigMD)) {
138        llvm::StringRef SigString = SigS->getString();
139        uint32_t Signature = 0;
140        if (SigString.getAsInteger(10, Signature)) {
141          ALOGE("Non-integer signature value '%s'", SigString.str().c_str());
142          return 0;
143        }
144        return Signature;
145      }
146    }
147
148    return 0;
149  }
150
151  bool isStepOptSupported(llvm::Type *AllocType) {
152
153    llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
154    llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context);
155
156    if (mEnableStepOpt) {
157      return false;
158    }
159
160    if (AllocType == VoidPtrTy) {
161      return false;
162    }
163
164    if (!PT) {
165      return false;
166    }
167
168    // remaining conditions are 64-bit only
169    if (VoidPtrTy->getPrimitiveSizeInBits() == 32) {
170      return true;
171    }
172
173    // coerce suggests an upconverted struct type, which we can't support
174    if (AllocType->getStructName().find("coerce") != llvm::StringRef::npos) {
175      return false;
176    }
177
178    // 2xi64 and i128 suggest an upconverted struct type, which are also unsupported
179    llvm::Type *V2xi64Ty = llvm::VectorType::get(llvm::Type::getInt64Ty(*Context), 2);
180    llvm::Type *Int128Ty = llvm::Type::getIntNTy(*Context, 128);
181    if (AllocType == V2xi64Ty || AllocType == Int128Ty) {
182      return false;
183    }
184
185    return true;
186  }
187
188  // Get the actual value we should use to step through an allocation.
189  //
190  // Normally the value we use to step through an allocation is given to us by
191  // the driver. However, for certain primitive data types, we can derive an
192  // integer constant for the step value. We use this integer constant whenever
193  // possible to allow further compiler optimizations to take place.
194  //
195  // DL - Target Data size/layout information.
196  // T - Type of allocation (should be a pointer).
197  // OrigStep - Original step increment (root.expand() input from driver).
198  llvm::Value *getStepValue(llvm::DataLayout *DL, llvm::Type *AllocType,
199                            llvm::Value *OrigStep) {
200    bccAssert(DL);
201    bccAssert(AllocType);
202    bccAssert(OrigStep);
203    llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
204    if (isStepOptSupported(AllocType)) {
205      llvm::Type *ET = PT->getElementType();
206      uint64_t ETSize = DL->getTypeAllocSize(ET);
207      llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
208      return llvm::ConstantInt::get(Int32Ty, ETSize);
209    } else {
210      return OrigStep;
211    }
212  }
213
214  /// Builds the types required by the pass for the given context.
215  void buildTypes(void) {
216    // Create the RsLaunchDimensionsTy and RsExpandKernelDriverInfoPfxTy structs.
217
218    llvm::Type *Int8Ty                   = llvm::Type::getInt8Ty(*Context);
219    llvm::Type *Int8PtrTy                = Int8Ty->getPointerTo();
220    llvm::Type *Int8PtrArrayInputLimitTy = llvm::ArrayType::get(Int8PtrTy, RS_KERNEL_INPUT_LIMIT);
221    llvm::Type *Int32Ty                  = llvm::Type::getInt32Ty(*Context);
222    llvm::Type *Int32ArrayInputLimitTy   = llvm::ArrayType::get(Int32Ty, RS_KERNEL_INPUT_LIMIT);
223    llvm::Type *VoidPtrTy                = llvm::Type::getInt8PtrTy(*Context);
224    llvm::Type *Int32Array4Ty            = llvm::ArrayType::get(Int32Ty, 4);
225
226    /* Defined in frameworks/base/libs/rs/cpu_ref/rsCpuCore.h:
227     *
228     * struct RsLaunchDimensions {
229     *   uint32_t x;
230     *   uint32_t y;
231     *   uint32_t z;
232     *   uint32_t lod;
233     *   uint32_t face;
234     *   uint32_t array[4];
235     * };
236     */
237    llvm::SmallVector<llvm::Type*, RsLaunchDimensionsFieldCount> RsLaunchDimensionsTypes;
238    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t x
239    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t y
240    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t z
241    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t lod
242    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t face
243    RsLaunchDimensionsTypes.push_back(Int32Array4Ty); // uint32_t array[4]
244    llvm::StructType *RsLaunchDimensionsTy =
245        llvm::StructType::create(RsLaunchDimensionsTypes, "RsLaunchDimensions");
246
247    /* Defined as the beginning of RsExpandKernelDriverInfo in frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h:
248     *
249     * struct RsExpandKernelDriverInfoPfx {
250     *     const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT];
251     *     uint32_t inStride[RS_KERNEL_INPUT_LIMIT];
252     *     uint32_t inLen;
253     *
254     *     uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT];
255     *     uint32_t outStride[RS_KERNEL_INPUT_LIMIT];
256     *     uint32_t outLen;
257     *
258     *     // Dimension of the launch
259     *     RsLaunchDimensions dim;
260     *
261     *     // The walking iterator of the launch
262     *     RsLaunchDimensions current;
263     *
264     *     const void *usr;
265     *     uint32_t usrLen;
266     *
267     *     // Items below this line are not used by the compiler and can be change in the driver.
268     *     // So the compiler must assume there are an unknown number of fields of unknown type
269     *     // beginning here.
270     * };
271     *
272     * The name "RsExpandKernelDriverInfoPfx" is known to RSInvariantPass (RSInvariant.cpp).
273     */
274    llvm::SmallVector<llvm::Type*, RsExpandKernelDriverInfoPfxFieldCount> RsExpandKernelDriverInfoPfxTypes;
275    RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT]
276    RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy);   // uint32_t inStride[RS_KERNEL_INPUT_LIMIT]
277    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t inLen
278    RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT]
279    RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy);   // uint32_t outStride[RS_KERNEL_INPUT_LIMIT]
280    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t outLen
281    RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy);     // RsLaunchDimensions dim
282    RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy);     // RsLaunchDimensions current
283    RsExpandKernelDriverInfoPfxTypes.push_back(VoidPtrTy);                // const void *usr
284    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t usrLen
285    llvm::StructType *RsExpandKernelDriverInfoPfxTy =
286        llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx");
287
288    // Create the function type for expanded kernels.
289
290    llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo();
291
292    llvm::SmallVector<llvm::Type*, 8> ParamTypes;
293    ParamTypes.push_back(RsExpandKernelDriverInfoPfxPtrTy); // const RsExpandKernelDriverInfoPfx *p
294    ParamTypes.push_back(Int32Ty);                          // uint32_t x1
295    ParamTypes.push_back(Int32Ty);                          // uint32_t x2
296    ParamTypes.push_back(Int32Ty);                          // uint32_t outstep
297
298    ExpandedFunctionType =
299        llvm::FunctionType::get(llvm::Type::getVoidTy(*Context), ParamTypes,
300                                false);
301  }
302
303  /// @brief Create skeleton of the expanded function.
304  ///
305  /// This creates a function with the following signature:
306  ///
307  ///   void (const RsForEachStubParamStruct *p, uint32_t x1, uint32_t x2,
308  ///         uint32_t outstep)
309  ///
310  llvm::Function *createEmptyExpandedFunction(llvm::StringRef OldName) {
311    llvm::Function *ExpandedFunction =
312      llvm::Function::Create(ExpandedFunctionType,
313                             llvm::GlobalValue::ExternalLinkage,
314                             OldName + ".expand", Module);
315
316    bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
317
318    llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
319
320    (AI++)->setName("p");
321    (AI++)->setName("x1");
322    (AI++)->setName("x2");
323    (AI++)->setName("arg_outstep");
324
325    llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
326                                                       ExpandedFunction);
327    llvm::IRBuilder<> Builder(Begin);
328    Builder.CreateRetVoid();
329
330    return ExpandedFunction;
331  }
332
333  /// @brief Create an empty loop
334  ///
335  /// Create a loop of the form:
336  ///
337  /// for (i = LowerBound; i < UpperBound; i++)
338  ///   ;
339  ///
340  /// After the loop has been created, the builder is set such that
341  /// instructions can be added to the loop body.
342  ///
343  /// @param Builder The builder to use to build this loop. The current
344  ///                position of the builder is the position the loop
345  ///                will be inserted.
346  /// @param LowerBound The first value of the loop iterator
347  /// @param UpperBound The maximal value of the loop iterator
348  /// @param LoopIV A reference that will be set to the loop iterator.
349  /// @return The BasicBlock that will be executed after the loop.
350  llvm::BasicBlock *createLoop(llvm::IRBuilder<> &Builder,
351                               llvm::Value *LowerBound,
352                               llvm::Value *UpperBound,
353                               llvm::PHINode **LoopIV) {
354    assert(LowerBound->getType() == UpperBound->getType());
355
356    llvm::BasicBlock *CondBB, *AfterBB, *HeaderBB;
357    llvm::Value *Cond, *IVNext;
358    llvm::PHINode *IV;
359
360    CondBB = Builder.GetInsertBlock();
361    AfterBB = llvm::SplitBlock(CondBB, Builder.GetInsertPoint(), nullptr, nullptr);
362    HeaderBB = llvm::BasicBlock::Create(*Context, "Loop", CondBB->getParent());
363
364    // if (LowerBound < Upperbound)
365    //   goto LoopHeader
366    // else
367    //   goto AfterBB
368    CondBB->getTerminator()->eraseFromParent();
369    Builder.SetInsertPoint(CondBB);
370    Cond = Builder.CreateICmpULT(LowerBound, UpperBound);
371    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
372
373    // iv = PHI [CondBB -> LowerBound], [LoopHeader -> NextIV ]
374    // iv.next = iv + 1
375    // if (iv.next < Upperbound)
376    //   goto LoopHeader
377    // else
378    //   goto AfterBB
379    Builder.SetInsertPoint(HeaderBB);
380    IV = Builder.CreatePHI(LowerBound->getType(), 2, "X");
381    IV->addIncoming(LowerBound, CondBB);
382    IVNext = Builder.CreateNUWAdd(IV, Builder.getInt32(1));
383    IV->addIncoming(IVNext, HeaderBB);
384    Cond = Builder.CreateICmpULT(IVNext, UpperBound);
385    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
386    AfterBB->setName("Exit");
387    Builder.SetInsertPoint(HeaderBB->getFirstNonPHI());
388    *LoopIV = IV;
389    return AfterBB;
390  }
391
392public:
393  RSForEachExpandPass(bool pEnableStepOpt = true)
394      : ModulePass(ID), Module(nullptr), Context(nullptr),
395        mEnableStepOpt(pEnableStepOpt) {
396
397  }
398
399  virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
400    // This pass does not use any other analysis passes, but it does
401    // add/wrap the existing functions in the module (thus altering the CFG).
402  }
403
404  // Build contribution to outgoing argument list for calling a
405  // ForEach-able function, based on the special parameters of that
406  // function.
407  //
408  // Signature - metadata bits for the signature of the ForEach-able function
409  // X, Arg_p - values derived directly from expanded function,
410  //            suitable for computing arguments for the ForEach-able function
411  // CalleeArgs - contribution is accumulated here
412  // Bump - invoked once for each contributed outgoing argument
413  void ExpandSpecialArguments(uint32_t Signature,
414                              llvm::Value *X,
415                              llvm::Value *Arg_p,
416                              llvm::IRBuilder<> &Builder,
417                              llvm::SmallVector<llvm::Value*, 8> &CalleeArgs,
418                              std::function<void ()> Bump) {
419
420    if (bcinfo::MetadataExtractor::hasForEachSignatureCtxt(Signature)) {
421      CalleeArgs.push_back(Arg_p);
422      Bump();
423    }
424
425    if (bcinfo::MetadataExtractor::hasForEachSignatureX(Signature)) {
426      CalleeArgs.push_back(X);
427      Bump();
428    }
429
430    if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature) ||
431        bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
432
433      llvm::Value *Current = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldCurrent);
434
435      if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature)) {
436        llvm::Value *Y = Builder.CreateLoad(
437            Builder.CreateStructGEP(nullptr, Current, RsLaunchDimensionsFieldY), "Y");
438
439        CalleeArgs.push_back(Y);
440        Bump();
441      }
442
443      if (bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
444        llvm::Value *Z = Builder.CreateLoad(
445            Builder.CreateStructGEP(nullptr, Current, RsLaunchDimensionsFieldZ), "Z");
446        CalleeArgs.push_back(Z);
447        Bump();
448      }
449    }
450  }
451
452  /* Performs the actual optimization on a selected function. On success, the
453   * Module will contain a new function of the name "<NAME>.expand" that
454   * invokes <NAME>() in a loop with the appropriate parameters.
455   */
456  bool ExpandFunction(llvm::Function *Function, uint32_t Signature) {
457    ALOGV("Expanding ForEach-able Function %s",
458          Function->getName().str().c_str());
459
460    if (!Signature) {
461      Signature = getRootSignature(Function);
462      if (!Signature) {
463        // We couldn't determine how to expand this function based on its
464        // function signature.
465        return false;
466      }
467    }
468
469    llvm::DataLayout DL(Module);
470
471    llvm::Function *ExpandedFunction =
472      createEmptyExpandedFunction(Function->getName());
473
474    /*
475     * Extract the expanded function's parameters.  It is guaranteed by
476     * createEmptyExpandedFunction that there will be five parameters.
477     */
478
479    bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
480
481    llvm::Function::arg_iterator ExpandedFunctionArgIter =
482      ExpandedFunction->arg_begin();
483
484    llvm::Value *Arg_p       = &*(ExpandedFunctionArgIter++);
485    llvm::Value *Arg_x1      = &*(ExpandedFunctionArgIter++);
486    llvm::Value *Arg_x2      = &*(ExpandedFunctionArgIter++);
487    llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);
488
489    llvm::Value *InStep  = nullptr;
490    llvm::Value *OutStep = nullptr;
491
492    // Construct the actual function body.
493    llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
494
495    // Collect and construct the arguments for the kernel().
496    // Note that we load any loop-invariant arguments before entering the Loop.
497    llvm::Function::arg_iterator FunctionArgIter = Function->arg_begin();
498
499    llvm::Type  *InTy      = nullptr;
500    llvm::Value *InBasePtr = nullptr;
501    if (bcinfo::MetadataExtractor::hasForEachSignatureIn(Signature)) {
502      llvm::Value *InsBasePtr  = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldInPtr, "inputs_base");
503
504      llvm::Value *InStepsBase = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldInStride, "insteps_base");
505
506      llvm::Value    *InStepAddr = Builder.CreateConstInBoundsGEP2_32(nullptr, InStepsBase, 0, 0);
507      llvm::LoadInst *InStepArg  = Builder.CreateLoad(InStepAddr,
508                                                      "instep_addr");
509
510      InTy = (FunctionArgIter++)->getType();
511      InStep = getStepValue(&DL, InTy, InStepArg);
512
513      InStep->setName("instep");
514
515      llvm::Value *InputAddr = Builder.CreateConstInBoundsGEP2_32(nullptr, InsBasePtr, 0, 0);
516      InBasePtr = Builder.CreateLoad(InputAddr, "input_base");
517    }
518
519    llvm::Type *OutTy = nullptr;
520    llvm::Value *OutBasePtr = nullptr;
521    if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
522      OutTy = (FunctionArgIter++)->getType();
523      OutStep = getStepValue(&DL, OutTy, Arg_outstep);
524      OutStep->setName("outstep");
525      OutBasePtr = Builder.CreateLoad(
526                     Builder.CreateConstInBoundsGEP2_32(nullptr,
527                         Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldOutPtr),
528                         0, 0));
529    }
530
531    llvm::Value *UsrData = nullptr;
532    if (bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)) {
533      llvm::Type *UsrDataTy = (FunctionArgIter++)->getType();
534      UsrData = Builder.CreatePointerCast(Builder.CreateLoad(
535          Builder.CreateStructGEP(nullptr, Arg_p,  RsExpandKernelDriverInfoPfxFieldUsr)), UsrDataTy);
536      UsrData->setName("UsrData");
537    }
538
539    llvm::PHINode *IV;
540    createLoop(Builder, Arg_x1, Arg_x2, &IV);
541
542    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
543    ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
544                           [&FunctionArgIter]() { FunctionArgIter++; });
545
546    bccAssert(FunctionArgIter == Function->arg_end());
547
548    // Populate the actual call to kernel().
549    llvm::SmallVector<llvm::Value*, 8> RootArgs;
550
551    llvm::Value *InPtr  = nullptr;
552    llvm::Value *OutPtr = nullptr;
553
554    // Calculate the current input and output pointers
555    //
556    // We always calculate the input/output pointers with a GEP operating on i8
557    // values and only cast at the very end to OutTy. This is because the step
558    // between two values is given in bytes.
559    //
560    // TODO: We could further optimize the output by using a GEP operation of
561    // type 'OutTy' in cases where the element type of the allocation allows.
562    if (OutBasePtr) {
563      llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
564      OutOffset = Builder.CreateMul(OutOffset, OutStep);
565      OutPtr = Builder.CreateGEP(OutBasePtr, OutOffset);
566      OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
567    }
568
569    if (InBasePtr) {
570      llvm::Value *InOffset = Builder.CreateSub(IV, Arg_x1);
571      InOffset = Builder.CreateMul(InOffset, InStep);
572      InPtr = Builder.CreateGEP(InBasePtr, InOffset);
573      InPtr = Builder.CreatePointerCast(InPtr, InTy);
574    }
575
576    if (InPtr) {
577      RootArgs.push_back(InPtr);
578    }
579
580    if (OutPtr) {
581      RootArgs.push_back(OutPtr);
582    }
583
584    if (UsrData) {
585      RootArgs.push_back(UsrData);
586    }
587
588    RootArgs.append(CalleeArgs.begin(), CalleeArgs.end());
589
590    Builder.CreateCall(Function, RootArgs);
591
592    return true;
593  }
594
595  /* Expand a pass-by-value kernel.
596   */
597  bool ExpandKernel(llvm::Function *Function, uint32_t Signature) {
598    bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature));
599    ALOGV("Expanding kernel Function %s", Function->getName().str().c_str());
600
601    // TODO: Refactor this to share functionality with ExpandFunction.
602    llvm::DataLayout DL(Module);
603
604    llvm::Function *ExpandedFunction =
605      createEmptyExpandedFunction(Function->getName());
606
607    /*
608     * Extract the expanded function's parameters.  It is guaranteed by
609     * createEmptyExpandedFunction that there will be five parameters.
610     */
611
612    bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
613
614    llvm::Function::arg_iterator ExpandedFunctionArgIter =
615      ExpandedFunction->arg_begin();
616
617    llvm::Value *Arg_p       = &*(ExpandedFunctionArgIter++);
618    llvm::Value *Arg_x1      = &*(ExpandedFunctionArgIter++);
619    llvm::Value *Arg_x2      = &*(ExpandedFunctionArgIter++);
620    llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);
621
622    // Construct the actual function body.
623    llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
624
625    // Create TBAA meta-data.
626    llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript,
627                 *TBAAAllocation, *TBAAPointer;
628    llvm::MDBuilder MDHelper(*Context);
629
630    TBAARenderScriptDistinct =
631      MDHelper.createTBAARoot("RenderScript Distinct TBAA");
632    TBAARenderScript = MDHelper.createTBAANode("RenderScript TBAA",
633        TBAARenderScriptDistinct);
634    TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
635                                                       TBAARenderScript);
636    TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
637                                                      TBAAAllocation, 0);
638    TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer",
639                                                    TBAARenderScript);
640    TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);
641
642    llvm::MDNode *AliasingDomain, *AliasingScope;
643    AliasingDomain = MDHelper.createAnonymousAliasScopeDomain("RS argument scope domain");
644    AliasingScope = MDHelper.createAnonymousAliasScope(AliasingDomain, "RS argument scope");
645
646    /*
647     * Collect and construct the arguments for the kernel().
648     *
649     * Note that we load any loop-invariant arguments before entering the Loop.
650     */
651    size_t NumInputs = Function->arg_size();
652
653    // No usrData parameter on kernels.
654    bccAssert(
655        !bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature));
656
657    llvm::Function::arg_iterator ArgIter = Function->arg_begin();
658
659    // Check the return type
660    llvm::Type     *OutTy            = nullptr;
661    llvm::Value    *OutStep          = nullptr;
662    llvm::LoadInst *OutBasePtr       = nullptr;
663    llvm::Value    *CastedOutBasePtr = nullptr;
664
665    bool PassOutByPointer = false;
666
667    if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
668      llvm::Type *OutBaseTy = Function->getReturnType();
669
670      if (OutBaseTy->isVoidTy()) {
671        PassOutByPointer = true;
672        OutTy = ArgIter->getType();
673
674        ArgIter++;
675        --NumInputs;
676      } else {
677        // We don't increment Args, since we are using the actual return type.
678        OutTy = OutBaseTy->getPointerTo();
679      }
680
681      OutStep = getStepValue(&DL, OutTy, Arg_outstep);
682      OutStep->setName("outstep");
683      OutBasePtr = Builder.CreateLoad(
684                     Builder.CreateConstInBoundsGEP2_32(nullptr,
685                         Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldOutPtr),
686                         0, 0));
687
688      if (gEnableRsTbaa) {
689        OutBasePtr->setMetadata("tbaa", TBAAPointer);
690      }
691
692      OutBasePtr->setMetadata("alias.scope", AliasingScope);
693
694      CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out");
695    }
696
697    llvm::PHINode *IV;
698    createLoop(Builder, Arg_x1, Arg_x2, &IV);
699
700    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
701    ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
702                           [&NumInputs]() { --NumInputs; });
703
704    llvm::SmallVector<llvm::Type*,  8> InTypes;
705    llvm::SmallVector<llvm::Value*, 8> InSteps;
706    llvm::SmallVector<llvm::Value*, 8> InBasePtrs;
707    llvm::SmallVector<bool,         8> InIsStructPointer;
708
709    bccAssert(NumInputs <= RS_KERNEL_INPUT_LIMIT);
710
711    if (NumInputs > 0) {
712      llvm::Value *InsBasePtr  = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldInPtr, "inputs_base");
713
714      llvm::Value *InStepsBase = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldInStride, "insteps_base");
715
716      for (size_t InputIndex = 0; InputIndex < NumInputs;
717           ++InputIndex, ArgIter++) {
718
719          llvm::Value    *InStepAddr = Builder.CreateConstInBoundsGEP2_32(nullptr, InStepsBase, 0, InputIndex);
720          llvm::LoadInst *InStepArg  = Builder.CreateLoad(InStepAddr,
721                                                          "instep_addr");
722
723          llvm::Type *InType = ArgIter->getType();
724
725        /*
726         * AArch64 calling dictate that structs of sufficient size get passed by
727         * pointer instead of passed by value.  This, combined with the fact
728         * that we don't allow kernels to operate on pointer data means that if
729         * we see a kernel with a pointer parameter we know that it is struct
730         * input that has been promoted.  As such we don't need to convert its
731         * type to a pointer.  Later we will need to know to avoid a load, so we
732         * save this information in InIsStructPointer.
733         */
734          if (!InType->isPointerTy()) {
735            InType = InType->getPointerTo();
736            InIsStructPointer.push_back(false);
737          } else {
738            InIsStructPointer.push_back(true);
739          }
740
741          llvm::Value *InStep = getStepValue(&DL, InType, InStepArg);
742
743          InStep->setName("instep");
744
745          llvm::Value    *InputAddr = Builder.CreateConstInBoundsGEP2_32(nullptr, InsBasePtr, 0, InputIndex);
746          llvm::LoadInst *InBasePtr = Builder.CreateLoad(InputAddr,
747                                                         "input_base");
748          llvm::Value    *CastInBasePtr = Builder.CreatePointerCast(InBasePtr,
749                                                                    InType, "casted_in");
750          if (gEnableRsTbaa) {
751            InBasePtr->setMetadata("tbaa", TBAAPointer);
752          }
753
754          InBasePtr->setMetadata("alias.scope", AliasingScope);
755
756          InTypes.push_back(InType);
757          InSteps.push_back(InStep);
758          InBasePtrs.push_back(CastInBasePtr);
759      }
760    }
761
762    // Populate the actual call to kernel().
763    llvm::SmallVector<llvm::Value*, 8> RootArgs;
764
765    // Calculate the current input and output pointers
766    //
767    //
768    // We always calculate the input/output pointers with a GEP operating on i8
769    // values combined with a multiplication and only cast at the very end to
770    // OutTy.  This is to account for dynamic stepping sizes when the value
771    // isn't apparent at compile time.  In the (very common) case when we know
772    // the step size at compile time, due to haveing complete type information
773    // this multiplication will optmized out and produces code equivalent to a
774    // a GEP on a pointer of the correct type.
775
776    // Output
777
778    llvm::Value *OutPtr = nullptr;
779    if (CastedOutBasePtr) {
780      llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
781
782      OutPtr    = Builder.CreateGEP(CastedOutBasePtr, OutOffset);
783
784      if (PassOutByPointer) {
785        RootArgs.push_back(OutPtr);
786      }
787    }
788
789    // Inputs
790
791    if (NumInputs > 0) {
792      llvm::Value *Offset = Builder.CreateSub(IV, Arg_x1);
793
794      for (size_t Index = 0; Index < NumInputs; ++Index) {
795        llvm::Value *InPtr    = Builder.CreateGEP(InBasePtrs[Index], Offset);
796        llvm::Value *Input;
797
798        if (InIsStructPointer[Index]) {
799          Input = InPtr;
800
801        } else {
802          llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input");
803
804          if (gEnableRsTbaa) {
805            InputLoad->setMetadata("tbaa", TBAAAllocation);
806          }
807
808          InputLoad->setMetadata("alias.scope", AliasingScope);
809
810          Input = InputLoad;
811        }
812
813        RootArgs.push_back(Input);
814      }
815    }
816
817    RootArgs.append(CalleeArgs.begin(), CalleeArgs.end());
818
819    llvm::Value *RetVal = Builder.CreateCall(Function, RootArgs);
820
821    if (OutPtr && !PassOutByPointer) {
822      llvm::StoreInst *Store = Builder.CreateStore(RetVal, OutPtr);
823      if (gEnableRsTbaa) {
824        Store->setMetadata("tbaa", TBAAAllocation);
825      }
826      Store->setMetadata("alias.scope", AliasingScope);
827    }
828
829    return true;
830  }
831
832  /// @brief Checks if pointers to allocation internals are exposed
833  ///
834  /// This function verifies if through the parameters passed to the kernel
835  /// or through calls to the runtime library the script gains access to
836  /// pointers pointing to data within a RenderScript Allocation.
837  /// If we know we control all loads from and stores to data within
838  /// RenderScript allocations and if we know the run-time internal accesses
839  /// are all annotated with RenderScript TBAA metadata, only then we
840  /// can safely use TBAA to distinguish between generic and from-allocation
841  /// pointers.
842  bool allocPointersExposed(llvm::Module &Module) {
843    // Old style kernel function can expose pointers to elements within
844    // allocations.
845    // TODO: Extend analysis to allow simple cases of old-style kernels.
846    for (size_t i = 0; i < mExportForEachCount; ++i) {
847      const char *Name = mExportForEachNameList[i];
848      uint32_t Signature = mExportForEachSignatureList[i];
849      if (Module.getFunction(Name) &&
850          !bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature)) {
851        return true;
852      }
853    }
854
855    // Check for library functions that expose a pointer to an Allocation or
856    // that are not yet annotated with RenderScript-specific tbaa information.
857    static std::vector<std::string> Funcs;
858
859    // rsGetElementAt(...)
860    Funcs.push_back("_Z14rsGetElementAt13rs_allocationj");
861    Funcs.push_back("_Z14rsGetElementAt13rs_allocationjj");
862    Funcs.push_back("_Z14rsGetElementAt13rs_allocationjjj");
863    // rsSetElementAt()
864    Funcs.push_back("_Z14rsSetElementAt13rs_allocationPvj");
865    Funcs.push_back("_Z14rsSetElementAt13rs_allocationPvjj");
866    Funcs.push_back("_Z14rsSetElementAt13rs_allocationPvjjj");
867    // rsGetElementAtYuv_uchar_Y()
868    Funcs.push_back("_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj");
869    // rsGetElementAtYuv_uchar_U()
870    Funcs.push_back("_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj");
871    // rsGetElementAtYuv_uchar_V()
872    Funcs.push_back("_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj");
873
874    for (std::vector<std::string>::iterator FI = Funcs.begin(),
875                                            FE = Funcs.end();
876         FI != FE; ++FI) {
877      llvm::Function *Function = Module.getFunction(*FI);
878
879      if (!Function) {
880        ALOGE("Missing run-time function '%s'", FI->c_str());
881        return true;
882      }
883
884      if (Function->getNumUses() > 0) {
885        return true;
886      }
887    }
888
889    return false;
890  }
891
892  /// @brief Connect RenderScript TBAA metadata to C/C++ metadata
893  ///
894  /// The TBAA metadata used to annotate loads/stores from RenderScript
895  /// Allocations is generated in a separate TBAA tree with a
896  /// "RenderScript Distinct TBAA" root node. LLVM does assume may-alias for
897  /// all nodes in unrelated alias analysis trees. This function makes the
898  /// "RenderScript TBAA" node (which is parented by the Distinct TBAA root),
899  /// a subtree of the normal C/C++ TBAA tree aside of normal C/C++ types. With
900  /// the connected trees every access to an Allocation is resolved to
901  /// must-alias if compared to a normal C/C++ access.
902  void connectRenderScriptTBAAMetadata(llvm::Module &Module) {
903    llvm::MDBuilder MDHelper(*Context);
904    llvm::MDNode *TBAARenderScriptDistinct =
905      MDHelper.createTBAARoot("RenderScript Distinct TBAA");
906    llvm::MDNode *TBAARenderScript = MDHelper.createTBAANode(
907        "RenderScript TBAA", TBAARenderScriptDistinct);
908    llvm::MDNode *TBAARoot     = MDHelper.createTBAARoot("Simple C/C++ TBAA");
909    TBAARenderScript->replaceOperandWith(1, TBAARoot);
910  }
911
912  virtual bool runOnModule(llvm::Module &Module) {
913    bool Changed  = false;
914    this->Module  = &Module;
915    this->Context = &Module.getContext();
916
917    this->buildTypes();
918
919    bcinfo::MetadataExtractor me(&Module);
920    if (!me.extract()) {
921      ALOGE("Could not extract metadata from module!");
922      return false;
923    }
924    mExportForEachCount = me.getExportForEachSignatureCount();
925    mExportForEachNameList = me.getExportForEachNameList();
926    mExportForEachSignatureList = me.getExportForEachSignatureList();
927
928    bool AllocsExposed = allocPointersExposed(Module);
929
930    for (size_t i = 0; i < mExportForEachCount; ++i) {
931      const char *name = mExportForEachNameList[i];
932      uint32_t signature = mExportForEachSignatureList[i];
933      llvm::Function *kernel = Module.getFunction(name);
934      if (kernel) {
935        if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) {
936          Changed |= ExpandKernel(kernel, signature);
937          kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
938        } else if (kernel->getReturnType()->isVoidTy()) {
939          Changed |= ExpandFunction(kernel, signature);
940          kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
941        } else {
942          // There are some graphics root functions that are not
943          // expanded, but that will be called directly. For those
944          // functions, we can not set the linkage to internal.
945        }
946      }
947    }
948
949    if (gEnableRsTbaa && !AllocsExposed) {
950      connectRenderScriptTBAAMetadata(Module);
951    }
952
953    return Changed;
954  }
955
956  virtual const char *getPassName() const {
957    return "ForEach-able Function Expansion";
958  }
959
960}; // end RSForEachExpandPass
961
962} // end anonymous namespace
963
964char RSForEachExpandPass::ID = 0;
965static llvm::RegisterPass<RSForEachExpandPass> X("foreachexp", "ForEach Expand Pass");
966
967namespace bcc {
968
969llvm::ModulePass *
970createRSForEachExpandPass(bool pEnableStepOpt){
971  return new RSForEachExpandPass(pEnableStepOpt);
972}
973
974} // end namespace bcc
975