RSKernelExpand.cpp revision 97e50993c70083fdedb4f1dd2c487aa55c6f60cf
1/*
2 * Copyright 2012, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "Assert.h"
18#include "Log.h"
19#include "RSTransforms.h"
20#include "RSUtils.h"
21
22#include "bcc/Config.h"
23#include "bcinfo/MetadataExtractor.h"
24
25#include "slang_version.h"
26
27#include <cstdlib>
28#include <functional>
29#include <unordered_set>
30
31#include <llvm/IR/DerivedTypes.h>
32#include <llvm/IR/Function.h>
33#include <llvm/IR/Instructions.h>
34#include <llvm/IR/IRBuilder.h>
35#include <llvm/IR/MDBuilder.h>
36#include <llvm/IR/Module.h>
37#include <llvm/Pass.h>
38#include <llvm/Support/raw_ostream.h>
39#include <llvm/IR/DataLayout.h>
40#include <llvm/IR/Function.h>
41#include <llvm/IR/Type.h>
42#include <llvm/Transforms/Utils/BasicBlockUtils.h>
43
#ifndef __DISABLE_ASSERTS
// Parameter counts of the generated ".expand" functions.  Both constants are
// referenced only from bccAssert() checks.
constexpr int kNumExpandedForeachParams = 4;
constexpr int kNumExpandedReduceAccumulatorParams = 4;
#endif

// Names for the RenderScript TBAA metadata nodes (presumably the root node and
// the ordinary node, going by the identifiers -- confirm at the use sites).
constexpr char kRenderScriptTBAARootName[] = "RenderScript Distinct TBAA";
constexpr char kRenderScriptTBAANodeName[] = "RenderScript TBAA";
52
53using namespace bcc;
54
55namespace {
56
// Compile-time switch: when true, the pass attaches "tbaa" metadata to loads
// it generates (e.g. the loads of the input buffer pointers).
static constexpr bool gEnableRsTbaa = true;
58
59/* RSKernelExpandPass
60 *
61 * This pass generates functions used to implement calls via
62 * rsForEach(), "foreach_<NAME>", or "reduce_<NAME>". We create an
63 * inner loop for the function to be invoked over the appropriate data
64 * cells of the input/output allocations (adjusting other relevant
65 * parameters as we go). We support doing this for any forEach or
66 * reduce style compute kernels.
67 *
68 * In the case of a foreach kernel or a simple reduction kernel, the
69 * new function name is the original function name "<NAME>" followed
70 * by ".expand" -- "<NAME>.expand".
71 *
72 * In the case of a general reduction kernel, the kernel's accumulator
73 * function is the one transformed, and the new function name is the
74 * original accumulator function name "<ACCUMFN>" followed by
75 * ".expand" -- "<ACCUMFN>.expand". Using the name "<ACCUMFN>.expand"
76 * for the function generated from the accumulator should not
77 * introduce any possibility for name clashes today: The accumulator
78 * function <ACCUMFN> must be static, so it cannot also serve as a
79 * foreach kernel; and the code for <ACCUMFN>.expand depends only on
80 * <ACCUMFN>, not on any other properties of the reduction kernel, so
81 * any reduction kernels that share the accumulator <ACCUMFN> can
82 * share <ACCUMFN>.expand also.
83 *
84 * Note that this pass does not delete the original function <NAME> or
85 * <ACCUMFN>. However, if it is inlined into the newly-generated
86 * function and not otherwise referenced, then a subsequent pass may
87 * delete it.
88 */
89class RSKernelExpandPass : public llvm::ModulePass {
90public:
91  static char ID;
92
93private:
94  static const size_t RS_KERNEL_INPUT_LIMIT = 8;  // see frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h
95
96  typedef std::unordered_set<llvm::Function *> FunctionSet;
97
98  enum RsLaunchDimensionsField {
99    RsLaunchDimensionsFieldX,
100    RsLaunchDimensionsFieldY,
101    RsLaunchDimensionsFieldZ,
102    RsLaunchDimensionsFieldLod,
103    RsLaunchDimensionsFieldFace,
104    RsLaunchDimensionsFieldArray,
105
106    RsLaunchDimensionsFieldCount
107  };
108
109  enum RsExpandKernelDriverInfoPfxField {
110    RsExpandKernelDriverInfoPfxFieldInPtr,
111    RsExpandKernelDriverInfoPfxFieldInStride,
112    RsExpandKernelDriverInfoPfxFieldInLen,
113    RsExpandKernelDriverInfoPfxFieldOutPtr,
114    RsExpandKernelDriverInfoPfxFieldOutStride,
115    RsExpandKernelDriverInfoPfxFieldOutLen,
116    RsExpandKernelDriverInfoPfxFieldDim,
117    RsExpandKernelDriverInfoPfxFieldCurrent,
118    RsExpandKernelDriverInfoPfxFieldUsr,
119    RsExpandKernelDriverInfoPfxFieldUsLenr,
120
121    RsExpandKernelDriverInfoPfxFieldCount
122  };
123
124  llvm::Module *Module;
125  llvm::LLVMContext *Context;
126
127  /*
   * Pointers to LLVM type information for the function signatures
129   * for expanded functions. These must be re-calculated for each module
130   * the pass is run on.
131   */
132  llvm::FunctionType *ExpandedForEachType;
133  llvm::Type *RsExpandKernelDriverInfoPfxTy;
134
135  // Initialized when we begin to process each Module
136  bool mStructExplicitlyPaddedBySlang;
137  uint32_t mExportForEachCount;
138  const char **mExportForEachNameList;
139  const uint32_t *mExportForEachSignatureList;
140
141  // Turns on optimization of allocation stride values.
142  bool mEnableStepOpt;
143
144  uint32_t getRootSignature(llvm::Function *Function) {
145    const llvm::NamedMDNode *ExportForEachMetadata =
146        Module->getNamedMetadata("#rs_export_foreach");
147
148    if (!ExportForEachMetadata) {
149      llvm::SmallVector<llvm::Type*, 8> RootArgTys;
150      for (llvm::Function::arg_iterator B = Function->arg_begin(),
151                                        E = Function->arg_end();
152           B != E;
153           ++B) {
154        RootArgTys.push_back(B->getType());
155      }
156
157      // For pre-ICS bitcode, we may not have signature information. In that
158      // case, we use the size of the RootArgTys to select the number of
159      // arguments.
160      return (1 << RootArgTys.size()) - 1;
161    }
162
163    if (ExportForEachMetadata->getNumOperands() == 0) {
164      return 0;
165    }
166
167    bccAssert(ExportForEachMetadata->getNumOperands() > 0);
168
169    // We only handle the case for legacy root() functions here, so this is
170    // hard-coded to look at only the first such function.
171    llvm::MDNode *SigNode = ExportForEachMetadata->getOperand(0);
172    if (SigNode != nullptr && SigNode->getNumOperands() == 1) {
173      llvm::Metadata *SigMD = SigNode->getOperand(0);
174      if (llvm::MDString *SigS = llvm::dyn_cast<llvm::MDString>(SigMD)) {
175        llvm::StringRef SigString = SigS->getString();
176        uint32_t Signature = 0;
177        if (SigString.getAsInteger(10, Signature)) {
178          ALOGE("Non-integer signature value '%s'", SigString.str().c_str());
179          return 0;
180        }
181        return Signature;
182      }
183    }
184
185    return 0;
186  }
187
188  bool isStepOptSupported(llvm::Type *AllocType) {
189
190    llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
191    llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context);
192
193    if (mEnableStepOpt) {
194      return false;
195    }
196
197    if (AllocType == VoidPtrTy) {
198      return false;
199    }
200
201    if (!PT) {
202      return false;
203    }
204
205    // remaining conditions are 64-bit only
206    if (VoidPtrTy->getPrimitiveSizeInBits() == 32) {
207      return true;
208    }
209
210    // coerce suggests an upconverted struct type, which we can't support
211    if (AllocType->getStructName().find("coerce") != llvm::StringRef::npos) {
212      return false;
213    }
214
215    // 2xi64 and i128 suggest an upconverted struct type, which are also unsupported
216    llvm::Type *V2xi64Ty = llvm::VectorType::get(llvm::Type::getInt64Ty(*Context), 2);
217    llvm::Type *Int128Ty = llvm::Type::getIntNTy(*Context, 128);
218    if (AllocType == V2xi64Ty || AllocType == Int128Ty) {
219      return false;
220    }
221
222    return true;
223  }
224
225  // Get the actual value we should use to step through an allocation.
226  //
227  // Normally the value we use to step through an allocation is given to us by
228  // the driver. However, for certain primitive data types, we can derive an
229  // integer constant for the step value. We use this integer constant whenever
230  // possible to allow further compiler optimizations to take place.
231  //
232  // DL - Target Data size/layout information.
233  // T - Type of allocation (should be a pointer).
234  // OrigStep - Original step increment (root.expand() input from driver).
235  llvm::Value *getStepValue(llvm::DataLayout *DL, llvm::Type *AllocType,
236                            llvm::Value *OrigStep) {
237    bccAssert(DL);
238    bccAssert(AllocType);
239    bccAssert(OrigStep);
240    llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
241    if (isStepOptSupported(AllocType)) {
242      llvm::Type *ET = PT->getElementType();
243      uint64_t ETSize = DL->getTypeAllocSize(ET);
244      llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
245      return llvm::ConstantInt::get(Int32Ty, ETSize);
246    } else {
247      return OrigStep;
248    }
249  }
250
251  /// Builds the types required by the pass for the given context.
252  void buildTypes(void) {
253    // Create the RsLaunchDimensionsTy and RsExpandKernelDriverInfoPfxTy structs.
254
255    llvm::Type *Int8Ty                   = llvm::Type::getInt8Ty(*Context);
256    llvm::Type *Int8PtrTy                = Int8Ty->getPointerTo();
257    llvm::Type *Int8PtrArrayInputLimitTy = llvm::ArrayType::get(Int8PtrTy, RS_KERNEL_INPUT_LIMIT);
258    llvm::Type *Int32Ty                  = llvm::Type::getInt32Ty(*Context);
259    llvm::Type *Int32ArrayInputLimitTy   = llvm::ArrayType::get(Int32Ty, RS_KERNEL_INPUT_LIMIT);
260    llvm::Type *VoidPtrTy                = llvm::Type::getInt8PtrTy(*Context);
261    llvm::Type *Int32Array4Ty            = llvm::ArrayType::get(Int32Ty, 4);
262
263    /* Defined in frameworks/base/libs/rs/cpu_ref/rsCpuCore.h:
264     *
265     * struct RsLaunchDimensions {
266     *   uint32_t x;
267     *   uint32_t y;
268     *   uint32_t z;
269     *   uint32_t lod;
270     *   uint32_t face;
271     *   uint32_t array[4];
272     * };
273     */
274    llvm::SmallVector<llvm::Type*, RsLaunchDimensionsFieldCount> RsLaunchDimensionsTypes;
275    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t x
276    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t y
277    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t z
278    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t lod
279    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t face
280    RsLaunchDimensionsTypes.push_back(Int32Array4Ty); // uint32_t array[4]
281    llvm::StructType *RsLaunchDimensionsTy =
282        llvm::StructType::create(RsLaunchDimensionsTypes, "RsLaunchDimensions");
283
284    /* Defined as the beginning of RsExpandKernelDriverInfo in frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h:
285     *
286     * struct RsExpandKernelDriverInfoPfx {
287     *     const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT];
288     *     uint32_t inStride[RS_KERNEL_INPUT_LIMIT];
289     *     uint32_t inLen;
290     *
291     *     uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT];
292     *     uint32_t outStride[RS_KERNEL_INPUT_LIMIT];
293     *     uint32_t outLen;
294     *
295     *     // Dimension of the launch
296     *     RsLaunchDimensions dim;
297     *
298     *     // The walking iterator of the launch
299     *     RsLaunchDimensions current;
300     *
301     *     const void *usr;
302     *     uint32_t usrLen;
303     *
304     *     // Items below this line are not used by the compiler and can be change in the driver.
305     *     // So the compiler must assume there are an unknown number of fields of unknown type
306     *     // beginning here.
307     * };
308     *
309     * The name "RsExpandKernelDriverInfoPfx" is known to RSInvariantPass (RSInvariant.cpp).
310     */
311    llvm::SmallVector<llvm::Type*, RsExpandKernelDriverInfoPfxFieldCount> RsExpandKernelDriverInfoPfxTypes;
312    RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT]
313    RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy);   // uint32_t inStride[RS_KERNEL_INPUT_LIMIT]
314    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t inLen
315    RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT]
316    RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy);   // uint32_t outStride[RS_KERNEL_INPUT_LIMIT]
317    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t outLen
318    RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy);     // RsLaunchDimensions dim
319    RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy);     // RsLaunchDimensions current
320    RsExpandKernelDriverInfoPfxTypes.push_back(VoidPtrTy);                // const void *usr
321    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t usrLen
322    RsExpandKernelDriverInfoPfxTy =
323        llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx");
324
325    // Create the function type for expanded kernels.
326    llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
327
328    llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo();
329    // void (const RsExpandKernelDriverInfoPfxTy *p, uint32_t x1, uint32_t x2, uint32_t outstep)
330    ExpandedForEachType = llvm::FunctionType::get(VoidTy,
331        {RsExpandKernelDriverInfoPfxPtrTy, Int32Ty, Int32Ty, Int32Ty}, false);
332  }
333
334  /// @brief Create skeleton of the expanded foreach kernel.
335  ///
336  /// This creates a function with the following signature:
337  ///
338  ///   void (const RsForEachStubParamStruct *p, uint32_t x1, uint32_t x2,
339  ///         uint32_t outstep)
340  ///
341  llvm::Function *createEmptyExpandedForEachKernel(llvm::StringRef OldName) {
342    llvm::Function *ExpandedFunction =
343      llvm::Function::Create(ExpandedForEachType,
344                             llvm::GlobalValue::ExternalLinkage,
345                             OldName + ".expand", Module);
346    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
347    llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
348    (AI++)->setName("p");
349    (AI++)->setName("x1");
350    (AI++)->setName("x2");
351    (AI++)->setName("arg_outstep");
352    llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
353                                                       ExpandedFunction);
354    llvm::IRBuilder<> Builder(Begin);
355    Builder.CreateRetVoid();
356    return ExpandedFunction;
357  }
358
359  // Create skeleton of a general reduce kernel's expanded accumulator.
360  //
361  // This creates a function with the following signature:
362  //
363  //  void @func.expand(%RsExpandKernelDriverInfoPfx* nocapture %p,
364  //                    i32 %x1, i32 %x2, accumType* nocapture %accum)
365  //
366  llvm::Function *createEmptyExpandedReduceAccumulator(llvm::StringRef OldName,
367                                                       llvm::Type *AccumArgTy) {
368    llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
369    llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
370    llvm::FunctionType *ExpandedReduceAccumulatorType =
371        llvm::FunctionType::get(VoidTy,
372                                {RsExpandKernelDriverInfoPfxTy->getPointerTo(),
373                                 Int32Ty, Int32Ty, AccumArgTy}, false);
374    llvm::Function *FnExpandedAccumulator =
375      llvm::Function::Create(ExpandedReduceAccumulatorType,
376                             llvm::GlobalValue::ExternalLinkage,
377                             OldName + ".expand", Module);
378    bccAssert(FnExpandedAccumulator->arg_size() == kNumExpandedReduceAccumulatorParams);
379
380    llvm::Function::arg_iterator AI = FnExpandedAccumulator->arg_begin();
381
382    using llvm::Attribute;
383
384    llvm::Argument *Arg_p = &(*AI++);
385    Arg_p->setName("p");
386    Arg_p->addAttr(llvm::AttributeSet::get(*Context, Arg_p->getArgNo() + 1,
387                                           llvm::makeArrayRef(Attribute::NoCapture)));
388
389    llvm::Argument *Arg_x1 = &(*AI++);
390    Arg_x1->setName("x1");
391
392    llvm::Argument *Arg_x2 = &(*AI++);
393    Arg_x2->setName("x2");
394
395    llvm::Argument *Arg_accum = &(*AI++);
396    Arg_accum->setName("accum");
397    Arg_accum->addAttr(llvm::AttributeSet::get(*Context, Arg_accum->getArgNo() + 1,
398                                               llvm::makeArrayRef(Attribute::NoCapture)));
399
400    llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
401                                                       FnExpandedAccumulator);
402    llvm::IRBuilder<> Builder(Begin);
403    Builder.CreateRetVoid();
404
405    return FnExpandedAccumulator;
406  }
407
408  /// @brief Create an empty loop
409  ///
410  /// Create a loop of the form:
411  ///
412  /// for (i = LowerBound; i < UpperBound; i++)
413  ///   ;
414  ///
415  /// After the loop has been created, the builder is set such that
416  /// instructions can be added to the loop body.
417  ///
418  /// @param Builder The builder to use to build this loop. The current
419  ///                position of the builder is the position the loop
420  ///                will be inserted.
421  /// @param LowerBound The first value of the loop iterator
422  /// @param UpperBound The maximal value of the loop iterator
423  /// @param LoopIV A reference that will be set to the loop iterator.
424  /// @return The BasicBlock that will be executed after the loop.
425  llvm::BasicBlock *createLoop(llvm::IRBuilder<> &Builder,
426                               llvm::Value *LowerBound,
427                               llvm::Value *UpperBound,
428                               llvm::Value **LoopIV) {
429    bccAssert(LowerBound->getType() == UpperBound->getType());
430
431    llvm::BasicBlock *CondBB, *AfterBB, *HeaderBB;
432    llvm::Value *Cond, *IVNext, *IV, *IVVar;
433
434    CondBB = Builder.GetInsertBlock();
435    AfterBB = llvm::SplitBlock(CondBB, &*Builder.GetInsertPoint(), nullptr, nullptr);
436    HeaderBB = llvm::BasicBlock::Create(*Context, "Loop", CondBB->getParent());
437
438    CondBB->getTerminator()->eraseFromParent();
439    Builder.SetInsertPoint(CondBB);
440
441    // decltype(LowerBound) *ivvar = alloca(sizeof(int))
442    // *ivvar = LowerBound
443    IVVar = Builder.CreateAlloca(LowerBound->getType(), nullptr, BCC_INDEX_VAR_NAME);
444    Builder.CreateStore(LowerBound, IVVar);
445
446    // if (LowerBound < Upperbound)
447    //   goto LoopHeader
448    // else
449    //   goto AfterBB
450    Cond = Builder.CreateICmpULT(LowerBound, UpperBound);
451    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
452
453    // LoopHeader:
454    //   iv = *ivvar
455    //   <insertion point here>
456    //   iv.next = iv + 1
457    //   *ivvar = iv.next
458    //   if (iv.next < Upperbound)
459    //     goto LoopHeader
460    //   else
461    //     goto AfterBB
462    // AfterBB:
463    Builder.SetInsertPoint(HeaderBB);
464    IV = Builder.CreateLoad(IVVar, "X");
465    IVNext = Builder.CreateNUWAdd(IV, Builder.getInt32(1));
466    Builder.CreateStore(IVNext, IVVar);
467    Cond = Builder.CreateICmpULT(IVNext, UpperBound);
468    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
469    AfterBB->setName("Exit");
470    Builder.SetInsertPoint(llvm::cast<llvm::Instruction>(IVNext));
471
472    // Record information about this loop.
473    *LoopIV = IV;
474    return AfterBB;
475  }
476
477  // Finish building the outgoing argument list for calling a ForEach-able function.
478  //
479  // ArgVector - on input, the non-special arguments
480  //             on output, the non-special arguments combined with the special arguments
481  //               from SpecialArgVector
482  // SpecialArgVector - special arguments (from ExpandSpecialArguments())
483  // SpecialArgContextIdx - return value of ExpandSpecialArguments()
484  //                          (position of context argument in SpecialArgVector)
485  // CalleeFunction - the ForEach-able function being called
486  // Builder - for inserting code into the caller function
487  template<unsigned int ArgVectorLen, unsigned int SpecialArgVectorLen>
488  void finishArgList(      llvm::SmallVector<llvm::Value *, ArgVectorLen>        &ArgVector,
489                     const llvm::SmallVector<llvm::Value *, SpecialArgVectorLen> &SpecialArgVector,
490                     const int SpecialArgContextIdx,
491                     const llvm::Function &CalleeFunction,
492                     llvm::IRBuilder<> &CallerBuilder) {
493    /* The context argument (if any) is a pointer to an opaque user-visible type that differs from
494     * the RsExpandKernelDriverInfoPfx type used in the function we are generating (although the
495     * two types represent the same thing).  Therefore, we must introduce a pointer cast when
496     * generating a call to the kernel function.
497     */
498    const int ArgContextIdx =
499        SpecialArgContextIdx >= 0 ? (ArgVector.size() + SpecialArgContextIdx) : SpecialArgContextIdx;
500    ArgVector.append(SpecialArgVector.begin(), SpecialArgVector.end());
501    if (ArgContextIdx >= 0) {
502      llvm::Type *ContextArgType = nullptr;
503      int ArgIdx = ArgContextIdx;
504      for (const auto &Arg : CalleeFunction.getArgumentList()) {
505        if (!ArgIdx--) {
506          ContextArgType = Arg.getType();
507          break;
508        }
509      }
510      bccAssert(ContextArgType);
511      ArgVector[ArgContextIdx] = CallerBuilder.CreatePointerCast(ArgVector[ArgContextIdx], ContextArgType);
512    }
513  }
514
515  // GEPHelper() returns a SmallVector of values suitable for passing
516  // to IRBuilder::CreateGEP(), and SmallGEPIndices is a typedef for
517  // the returned data type. It is sized so that the SmallVector
518  // returned by GEPHelper() never needs to do a heap allocation for
519  // any list of GEP indices it encounters in the code.
520  typedef llvm::SmallVector<llvm::Value *, 3> SmallGEPIndices;
521
522  // Helper for turning a list of constant integer GEP indices into a
523  // SmallVector of llvm::Value*. The return value is suitable for
524  // passing to a GetElementPtrInst constructor or IRBuilder::CreateGEP().
525  //
526  // Inputs:
527  //   I32Args should be integers which represent the index arguments
528  //   to a GEP instruction.
529  //
530  // Returns:
531  //   Returns a SmallVector of ConstantInts.
532  SmallGEPIndices GEPHelper(const std::initializer_list<int32_t> I32Args) {
533    SmallGEPIndices Out(I32Args.size());
534    llvm::IntegerType *I32Ty = llvm::Type::getInt32Ty(*Context);
535    std::transform(I32Args.begin(), I32Args.end(), Out.begin(),
536                   [I32Ty](int32_t Arg) { return llvm::ConstantInt::get(I32Ty, Arg); });
537    return Out;
538  }
539
540public:
541  explicit RSKernelExpandPass(bool pEnableStepOpt = true)
542      : ModulePass(ID), Module(nullptr), Context(nullptr),
543        mEnableStepOpt(pEnableStepOpt) {
544
545  }
546
547  virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
548    // This pass does not use any other analysis passes, but it does
549    // add/wrap the existing functions in the module (thus altering the CFG).
550  }
551
552  // Build contribution to outgoing argument list for calling a
553  // ForEach-able function or a general reduction accumulator
554  // function, based on the special parameters of that function.
555  //
556  // Signature - metadata bits for the signature of the callee
557  // X, Arg_p - values derived directly from expanded function,
558  //            suitable for computing arguments for the callee
559  // CalleeArgs - contribution is accumulated here
560  // Bump - invoked once for each contributed outgoing argument
561  // LoopHeaderInsertionPoint - an Instruction in the loop header, before which
562  //                            this function can insert loop-invariant loads
563  //
564  // Return value is the (zero-based) position of the context (Arg_p)
565  // argument in the CalleeArgs vector, or a negative value if the
566  // context argument is not placed in the CalleeArgs vector.
567  int ExpandSpecialArguments(uint32_t Signature,
568                             llvm::Value *X,
569                             llvm::Value *Arg_p,
570                             llvm::IRBuilder<> &Builder,
571                             llvm::SmallVector<llvm::Value*, 8> &CalleeArgs,
572                             const std::function<void ()> &Bump,
573                             llvm::Instruction *LoopHeaderInsertionPoint) {
574
575    bccAssert(CalleeArgs.empty());
576
577    int Return = -1;
578    if (bcinfo::MetadataExtractor::hasForEachSignatureCtxt(Signature)) {
579      CalleeArgs.push_back(Arg_p);
580      Bump();
581      Return = CalleeArgs.size() - 1;
582    }
583
584    if (bcinfo::MetadataExtractor::hasForEachSignatureX(Signature)) {
585      CalleeArgs.push_back(X);
586      Bump();
587    }
588
589    if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature) ||
590        bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
591      bccAssert(LoopHeaderInsertionPoint);
592
593      // Y and Z are loop invariant, so they can be hoisted out of the
594      // loop. Set the IRBuilder insertion point to the loop header.
595      auto OldInsertionPoint = Builder.saveIP();
596      Builder.SetInsertPoint(LoopHeaderInsertionPoint);
597
598      if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature)) {
599        SmallGEPIndices YValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent,
600          RsLaunchDimensionsFieldY}));
601        llvm::Value *YAddr = Builder.CreateInBoundsGEP(Arg_p, YValueGEP, "Y.gep");
602        CalleeArgs.push_back(Builder.CreateLoad(YAddr, "Y"));
603        Bump();
604      }
605
606      if (bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
607        SmallGEPIndices ZValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent,
608          RsLaunchDimensionsFieldZ}));
609        llvm::Value *ZAddr = Builder.CreateInBoundsGEP(Arg_p, ZValueGEP, "Z.gep");
610        CalleeArgs.push_back(Builder.CreateLoad(ZAddr, "Z"));
611        Bump();
612      }
613
614      Builder.restoreIP(OldInsertionPoint);
615    }
616
617    return Return;
618  }
619
620  // Generate loop-invariant input processing setup code for an expanded
621  // ForEach-able function or an expanded general reduction accumulator
622  // function.
623  //
624  // LoopHeader - block at the end of which the setup code will be inserted
625  // Arg_p - RSKernelDriverInfo pointer passed to the expanded function
626  // TBAAPointer - metadata for marking loads of pointer values out of RSKernelDriverInfo
627  // ArgIter - iterator pointing to first input of the UNexpanded function
628  // NumInputs - number of inputs (NOT number of ARGUMENTS)
629  //
630  // InTypes[] - this function saves input type, they will be used in ExpandInputsBody().
631  // InBufPtrs[] - this function sets each array element to point to the first cell / byte
632  //               (byte for x86, cell for other platforms) of the corresponding input allocation
633  // InStructTempSlots[] - this function sets each array element either to nullptr
634  //                       or to the result of an alloca (for the case where the
635  //                       calling convention dictates that a value must be passed
636  //                       by reference, and so we need a stacked temporary to hold
637  //                       a copy of that value)
638  void ExpandInputsLoopInvariant(llvm::IRBuilder<> &Builder, llvm::BasicBlock *LoopHeader,
639                                 llvm::Value *Arg_p,
640                                 llvm::MDNode *TBAAPointer,
641                                 llvm::Function::arg_iterator ArgIter,
642                                 const size_t NumInputs,
643                                 llvm::SmallVectorImpl<llvm::Type *> &InTypes,
644                                 llvm::SmallVectorImpl<llvm::Value *> &InBufPtrs,
645                                 llvm::SmallVectorImpl<llvm::Value *> &InStructTempSlots) {
646    bccAssert(NumInputs <= RS_KERNEL_INPUT_LIMIT);
647
648    // Extract information about input slots. The work done
649    // here is loop-invariant, so we can hoist the operations out of the loop.
650    auto OldInsertionPoint = Builder.saveIP();
651    Builder.SetInsertPoint(LoopHeader->getTerminator());
652
653    for (size_t InputIndex = 0; InputIndex < NumInputs; ++InputIndex, ArgIter++) {
654      llvm::Type *InType = ArgIter->getType();
655
656      /*
657       * AArch64 calling conventions dictate that structs of sufficient size
658       * get passed by pointer instead of passed by value.  This, combined
659       * with the fact that we don't allow kernels to operate on pointer
660       * data means that if we see a kernel with a pointer parameter we know
661       * that it is a struct input that has been promoted.  As such we don't
662       * need to convert its type to a pointer.  Later we will need to know
663       * to create a temporary copy on the stack, so we save this information
664       * in InStructTempSlots.
665       */
666      if (auto PtrType = llvm::dyn_cast<llvm::PointerType>(InType)) {
667        llvm::Type *ElementType = PtrType->getElementType();
668        InStructTempSlots.push_back(Builder.CreateAlloca(ElementType, nullptr,
669                                                         "input_struct_slot"));
670      } else {
671        InType = InType->getPointerTo();
672        InStructTempSlots.push_back(nullptr);
673      }
674
675      SmallGEPIndices InBufPtrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr,
676                                             static_cast<int32_t>(InputIndex)}));
677      llvm::Value    *InBufPtrAddr = Builder.CreateInBoundsGEP(Arg_p, InBufPtrGEP, "input_buf.gep");
678      llvm::LoadInst *InBufPtr = Builder.CreateLoad(InBufPtrAddr, "input_buf");
679
680      llvm::Value *CastInBufPtr = nullptr;
681      if (mStructExplicitlyPaddedBySlang || (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING)) {
682        CastInBufPtr = Builder.CreatePointerCast(InBufPtr, InType, "casted_in");
683      } else {
684        // The disagreement between module and x86 target machine datalayout
685        // causes mismatched input/output data offset between slang reflected
686        // code and bcc codegen for GetElementPtr. To solve this issue, skip the
687        // cast to InType and leave CastInBufPtr as an int8_t*.  The buffer is
688        // later indexed with an explicit byte offset computed based on
689        // X86_CUSTOM_DL_STRING and then bitcast to actual input type.
690        CastInBufPtr = InBufPtr;
691      }
692
693      if (gEnableRsTbaa) {
694        InBufPtr->setMetadata("tbaa", TBAAPointer);
695      }
696
697      InTypes.push_back(InType);
698      InBufPtrs.push_back(CastInBufPtr);
699    }
700
701    Builder.restoreIP(OldInsertionPoint);
702  }
703
704  // Generate loop-varying input processing code for an expanded ForEach-able function
705  // or an expanded general reduction accumulator function.  Also, for the call to the
706  // UNexpanded function, collect the portion of the argument list corresponding to the
707  // inputs.
708  //
709  // Arg_x1 - first X coordinate to be processed by the expanded function
710  // TBAAAllocation - metadata for marking loads of input values out of allocations
711  // NumInputs -- number of inputs (NOT number of ARGUMENTS)
712  // InTypes[] - this function uses the saved input types in ExpandInputsLoopInvariant()
713  //             to convert the pointer of byte InPtr to its real type.
714  // InBufPtrs[] - this function consumes the information produced by ExpandInputsLoopInvariant()
715  // InStructTempSlots[] - this function consumes the information produced by ExpandInputsLoopInvariant()
716  // IndVar - value of loop induction variable (X coordinate) for a given loop iteration
717  //
718  // RootArgs - this function sets this to the list of outgoing argument values corresponding
719  //            to the inputs
720  void ExpandInputsBody(llvm::IRBuilder<> &Builder,
721                        llvm::Value *Arg_x1,
722                        llvm::MDNode *TBAAAllocation,
723                        const size_t NumInputs,
724                        const llvm::SmallVectorImpl<llvm::Type *> &InTypes,
725                        const llvm::SmallVectorImpl<llvm::Value *> &InBufPtrs,
726                        const llvm::SmallVectorImpl<llvm::Value *> &InStructTempSlots,
727                        llvm::Value *IndVar,
728                        llvm::SmallVectorImpl<llvm::Value *> &RootArgs) {
729    llvm::Value *Offset = Builder.CreateSub(IndVar, Arg_x1);
730    llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
731
732    for (size_t Index = 0; Index < NumInputs; ++Index) {
733
734      llvm::Value *InPtr = nullptr;
735      if (mStructExplicitlyPaddedBySlang || (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING)) {
736        InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], Offset);
737      } else {
738        // Treat x86 input buffer as byte[], get indexed pointer with explicit
739        // byte offset computed using a datalayout based on
740        // X86_CUSTOM_DL_STRING, then bitcast it to actual input type.
741        llvm::DataLayout DL(X86_CUSTOM_DL_STRING);
742        llvm::Type *InTy = InTypes[Index];
743        uint64_t InStep = DL.getTypeAllocSize(InTy->getPointerElementType());
744        llvm::Value *OffsetInBytes = Builder.CreateMul(Offset, llvm::ConstantInt::get(Int32Ty, InStep));
745        InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], OffsetInBytes);
746        InPtr = Builder.CreatePointerCast(InPtr, InTy);
747      }
748
749      llvm::Value *Input;
750      llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input");
751
752      if (gEnableRsTbaa) {
753        InputLoad->setMetadata("tbaa", TBAAAllocation);
754      }
755
756      if (llvm::Value *TemporarySlot = InStructTempSlots[Index]) {
757        // Pass a pointer to a temporary on the stack, rather than
758        // passing a pointer to the original value. We do not want
759        // the kernel to potentially modify the input data.
760
761        // Note: don't annotate with TBAA, since the kernel might
762        // have its own TBAA annotations for the pointer argument.
763        Builder.CreateStore(InputLoad, TemporarySlot);
764        Input = TemporarySlot;
765      } else {
766        Input = InputLoad;
767      }
768
769      RootArgs.push_back(Input);
770    }
771  }
772
773  /* Performs the actual optimization on a selected function. On success, the
774   * Module will contain a new function of the name "<NAME>.expand" that
775   * invokes <NAME>() in a loop with the appropriate parameters.
776   */
777  bool ExpandOldStyleForEach(llvm::Function *Function, uint32_t Signature) {
778    ALOGV("Expanding ForEach-able Function %s",
779          Function->getName().str().c_str());
780
781    if (!Signature) {
782      Signature = getRootSignature(Function);
783      if (!Signature) {
784        // We couldn't determine how to expand this function based on its
785        // function signature.
786        return false;
787      }
788    }
789
790    llvm::DataLayout DL(Module);
791    if (!mStructExplicitlyPaddedBySlang && (Module->getTargetTriple() == DEFAULT_X86_TRIPLE_STRING)) {
792      DL.reset(X86_CUSTOM_DL_STRING);
793    }
794
795    llvm::Function *ExpandedFunction =
796      createEmptyExpandedForEachKernel(Function->getName());
797
798    /*
799     * Extract the expanded function's parameters.  It is guaranteed by
800     * createEmptyExpandedForEachKernel that there will be four parameters.
801     */
802
803    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
804
805    llvm::Function::arg_iterator ExpandedFunctionArgIter =
806      ExpandedFunction->arg_begin();
807
808    llvm::Value *Arg_p       = &*(ExpandedFunctionArgIter++);
809    llvm::Value *Arg_x1      = &*(ExpandedFunctionArgIter++);
810    llvm::Value *Arg_x2      = &*(ExpandedFunctionArgIter++);
811    llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);
812
813    llvm::Value *InStep  = nullptr;
814    llvm::Value *OutStep = nullptr;
815
816    // Construct the actual function body.
817    llvm::IRBuilder<> Builder(&*ExpandedFunction->getEntryBlock().begin());
818
819    // Collect and construct the arguments for the kernel().
820    // Note that we load any loop-invariant arguments before entering the Loop.
821    llvm::Function::arg_iterator FunctionArgIter = Function->arg_begin();
822
823    llvm::Type  *InTy      = nullptr;
824    llvm::Value *InBufPtr = nullptr;
825    if (bcinfo::MetadataExtractor::hasForEachSignatureIn(Signature)) {
826      SmallGEPIndices InStepGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInStride, 0}));
827      llvm::LoadInst *InStepArg  = Builder.CreateLoad(
828        Builder.CreateInBoundsGEP(Arg_p, InStepGEP, "instep_addr.gep"), "instep_addr");
829
830      InTy = (FunctionArgIter++)->getType();
831      InStep = getStepValue(&DL, InTy, InStepArg);
832
833      InStep->setName("instep");
834
835      SmallGEPIndices InputAddrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr, 0}));
836      InBufPtr = Builder.CreateLoad(
837        Builder.CreateInBoundsGEP(Arg_p, InputAddrGEP, "input_buf.gep"), "input_buf");
838    }
839
840    llvm::Type *OutTy = nullptr;
841    llvm::Value *OutBasePtr = nullptr;
842    if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
843      OutTy = (FunctionArgIter++)->getType();
844      OutStep = getStepValue(&DL, OutTy, Arg_outstep);
845      OutStep->setName("outstep");
846      SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
847      OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
848    }
849
850    llvm::Value *UsrData = nullptr;
851    if (bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)) {
852      llvm::Type *UsrDataTy = (FunctionArgIter++)->getType();
853      llvm::Value *UsrDataPointerAddr = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldUsr);
854      UsrData = Builder.CreatePointerCast(Builder.CreateLoad(UsrDataPointerAddr), UsrDataTy);
855      UsrData->setName("UsrData");
856    }
857
858    llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
859    llvm::Value *IV;
860    createLoop(Builder, Arg_x1, Arg_x2, &IV);
861
862    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
863    const int CalleeArgsContextIdx = ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
864                                                            [&FunctionArgIter]() { FunctionArgIter++; },
865                                                            LoopHeader->getTerminator());
866
867    bccAssert(FunctionArgIter == Function->arg_end());
868
869    // Populate the actual call to kernel().
870    llvm::SmallVector<llvm::Value*, 8> RootArgs;
871
872    llvm::Value *InPtr  = nullptr;
873    llvm::Value *OutPtr = nullptr;
874
875    // Calculate the current input and output pointers
876    //
877    // We always calculate the input/output pointers with a GEP operating on i8
878    // values and only cast at the very end to OutTy. This is because the step
879    // between two values is given in bytes.
880    //
881    // TODO: We could further optimize the output by using a GEP operation of
882    // type 'OutTy' in cases where the element type of the allocation allows.
883    if (OutBasePtr) {
884      llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
885      OutOffset = Builder.CreateMul(OutOffset, OutStep);
886      OutPtr = Builder.CreateInBoundsGEP(OutBasePtr, OutOffset);
887      OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
888    }
889
890    if (InBufPtr) {
891      llvm::Value *InOffset = Builder.CreateSub(IV, Arg_x1);
892      InOffset = Builder.CreateMul(InOffset, InStep);
893      InPtr = Builder.CreateInBoundsGEP(InBufPtr, InOffset);
894      InPtr = Builder.CreatePointerCast(InPtr, InTy);
895    }
896
897    if (InPtr) {
898      RootArgs.push_back(InPtr);
899    }
900
901    if (OutPtr) {
902      RootArgs.push_back(OutPtr);
903    }
904
905    if (UsrData) {
906      RootArgs.push_back(UsrData);
907    }
908
909    finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);
910
911    Builder.CreateCall(Function, RootArgs);
912
913    return true;
914  }
915
916  /* Expand a pass-by-value foreach kernel.
917   */
918  bool ExpandForEach(llvm::Function *Function, uint32_t Signature) {
919    bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature));
920    ALOGV("Expanding kernel Function %s", Function->getName().str().c_str());
921
922    // TODO: Refactor this to share functionality with ExpandOldStyleForEach.
923    llvm::DataLayout DL(Module);
924    if (!mStructExplicitlyPaddedBySlang && (Module->getTargetTriple() == DEFAULT_X86_TRIPLE_STRING)) {
925      DL.reset(X86_CUSTOM_DL_STRING);
926    }
927    llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
928
929    llvm::Function *ExpandedFunction =
930      createEmptyExpandedForEachKernel(Function->getName());
931
932    /*
933     * Extract the expanded function's parameters.  It is guaranteed by
934     * createEmptyExpandedForEachKernel that there will be four parameters.
935     */
936
937    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
938
939    llvm::Function::arg_iterator ExpandedFunctionArgIter =
940      ExpandedFunction->arg_begin();
941
942    llvm::Value *Arg_p       = &*(ExpandedFunctionArgIter++);
943    llvm::Value *Arg_x1      = &*(ExpandedFunctionArgIter++);
944    llvm::Value *Arg_x2      = &*(ExpandedFunctionArgIter++);
945    // Arg_outstep is not used by expanded new-style forEach kernels.
946
947    // Construct the actual function body.
948    llvm::IRBuilder<> Builder(&*ExpandedFunction->getEntryBlock().begin());
949
950    // Create TBAA meta-data.
951    llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript,
952                 *TBAAAllocation, *TBAAPointer;
953    llvm::MDBuilder MDHelper(*Context);
954
955    TBAARenderScriptDistinct =
956      MDHelper.createTBAARoot(kRenderScriptTBAARootName);
957    TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
958        TBAARenderScriptDistinct);
959    TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
960                                                       TBAARenderScript);
961    TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
962                                                      TBAAAllocation, 0);
963    TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer",
964                                                    TBAARenderScript);
965    TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);
966
967    /*
968     * Collect and construct the arguments for the kernel().
969     *
970     * Note that we load any loop-invariant arguments before entering the Loop.
971     */
972    size_t NumRemainingInputs = Function->arg_size();
973
974    // No usrData parameter on kernels.
975    bccAssert(
976        !bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature));
977
978    llvm::Function::arg_iterator ArgIter = Function->arg_begin();
979
980    // Check the return type
981    llvm::Type     *OutTy            = nullptr;
982    llvm::LoadInst *OutBasePtr       = nullptr;
983    llvm::Value    *CastedOutBasePtr = nullptr;
984
985    bool PassOutByPointer = false;
986
987    if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
988      llvm::Type *OutBaseTy = Function->getReturnType();
989
990      if (OutBaseTy->isVoidTy()) {
991        PassOutByPointer = true;
992        OutTy = ArgIter->getType();
993
994        ArgIter++;
995        --NumRemainingInputs;
996      } else {
997        // We don't increment Args, since we are using the actual return type.
998        OutTy = OutBaseTy->getPointerTo();
999      }
1000
1001      SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
1002      OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
1003
1004      if (gEnableRsTbaa) {
1005        OutBasePtr->setMetadata("tbaa", TBAAPointer);
1006      }
1007
1008      if (mStructExplicitlyPaddedBySlang || (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING)) {
1009        CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out");
1010      } else {
1011        // The disagreement between module and x86 target machine datalayout
1012        // causes mismatched input/output data offset between slang reflected
1013        // code and bcc codegen for GetElementPtr. To solve this issue, skip the
1014        // cast to OutTy and leave CastedOutBasePtr as an int8_t*.  The buffer
1015        // is later indexed with an explicit byte offset computed based on
1016        // X86_CUSTOM_DL_STRING and then bitcast to actual output type.
1017        CastedOutBasePtr = OutBasePtr;
1018      }
1019    }
1020
1021    llvm::SmallVector<llvm::Type*,  8> InTypes;
1022    llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
1023    llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
1024
1025    bccAssert(NumRemainingInputs <= RS_KERNEL_INPUT_LIMIT);
1026
1027    // Create the loop structure.
1028    llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
1029    llvm::Value *IV;
1030    createLoop(Builder, Arg_x1, Arg_x2, &IV);
1031
1032    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
1033    const int CalleeArgsContextIdx =
1034      ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
1035                             [&NumRemainingInputs]() { --NumRemainingInputs; },
1036                             LoopHeader->getTerminator());
1037
1038    // After ExpandSpecialArguments() gets called, NumRemainingInputs
1039    // counts the number of arguments to the kernel that correspond to
1040    // an array entry from the InPtr field of the DriverInfo
1041    // structure.
1042    const size_t NumInPtrArguments = NumRemainingInputs;
1043
1044    if (NumInPtrArguments > 0) {
1045      ExpandInputsLoopInvariant(Builder, LoopHeader, Arg_p, TBAAPointer, ArgIter, NumInPtrArguments,
1046                                InTypes, InBufPtrs, InStructTempSlots);
1047    }
1048
1049    // Populate the actual call to kernel().
1050    llvm::SmallVector<llvm::Value*, 8> RootArgs;
1051
1052    // Calculate the current input and output pointers.
1053
1054    // Output
1055
1056    llvm::Value *OutPtr = nullptr;
1057    if (CastedOutBasePtr) {
1058      llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
1059
1060      if (mStructExplicitlyPaddedBySlang || (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING)) {
1061        OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffset);
1062      } else {
1063        // Treat x86 output buffer as byte[], get indexed pointer with explicit
1064        // byte offset computed using a datalayout based on
1065        // X86_CUSTOM_DL_STRING, then bitcast it to actual output type.
1066        uint64_t OutStep = DL.getTypeAllocSize(OutTy->getPointerElementType());
1067        llvm::Value *OutOffsetInBytes = Builder.CreateMul(OutOffset, llvm::ConstantInt::get(Int32Ty, OutStep));
1068        OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffsetInBytes);
1069        OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
1070      }
1071
1072      if (PassOutByPointer) {
1073        RootArgs.push_back(OutPtr);
1074      }
1075    }
1076
1077    // Inputs
1078
1079    if (NumInPtrArguments > 0) {
1080      ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInPtrArguments,
1081                       InTypes, InBufPtrs, InStructTempSlots, IV, RootArgs);
1082    }
1083
1084    finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);
1085
1086    llvm::Value *RetVal = Builder.CreateCall(Function, RootArgs);
1087
1088    if (OutPtr && !PassOutByPointer) {
1089      RetVal->setName("call.result");
1090      llvm::StoreInst *Store = Builder.CreateStore(RetVal, OutPtr);
1091      if (gEnableRsTbaa) {
1092        Store->setMetadata("tbaa", TBAAAllocation);
1093      }
1094    }
1095
1096    return true;
1097  }
1098
1099  // Certain categories of functions that make up a general
1100  // reduce-style kernel are called directly from the driver with no
1101  // expansion needed.  For a function in such a category, we need to
1102  // promote linkage from static to external, to ensure that the
1103  // function is visible to the driver in the dynamic symbol table.
1104  // This promotion is safe because we don't have any kind of cross
1105  // translation unit linkage model (except for linking against
1106  // RenderScript libraries), so we do not risk name clashes.
1107  bool PromoteReduceFunction(const char *Name, FunctionSet &PromotedFunctions) {
1108    if (!Name)  // a presumably-optional function that is not present
1109      return false;
1110
1111    llvm::Function *Fn = Module->getFunction(Name);
1112    bccAssert(Fn != nullptr);
1113    if (PromotedFunctions.insert(Fn).second) {
1114      bccAssert(Fn->getLinkage() == llvm::GlobalValue::InternalLinkage);
1115      Fn->setLinkage(llvm::GlobalValue::ExternalLinkage);
1116      return true;
1117    }
1118
1119    return false;
1120  }
1121
1122  // Expand the accumulator function for a general reduce-style kernel.
1123  //
1124  // The input is a function of the form
1125  //
1126  //   define void @func(accumType* %accum, foo1 in1[, ... fooN inN] [, special arguments])
1127  //
1128  // where all arguments except the first are the same as for a foreach kernel.
1129  //
1130  // The input accumulator function gets expanded into a function of the form
1131  //
1132  //   define void @func.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, accumType* %accum)
1133  //
1134  // which performs a serial accumulaion of elements [x1, x2) into *%accum.
1135  //
1136  // In pseudocode, @func.expand does:
1137  //
1138  //   for (i = %x1; i < %x2; ++i) {
1139  //     func(%accum,
1140  //          *((foo1 *)p->inPtr[0] + i)[, ... *((fooN *)p->inPtr[N-1] + i)
1141  //          [, p] [, i] [, p->current.y] [, p->current.z]);
1142  //   }
1143  //
1144  // This is very similar to foreach kernel expansion with no output.
1145  bool ExpandReduceAccumulator(llvm::Function *FnAccumulator, uint32_t Signature, size_t NumInputs) {
1146    ALOGV("Expanding accumulator %s for general reduce kernel",
1147          FnAccumulator->getName().str().c_str());
1148
1149    // Create TBAA meta-data.
1150    llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript,
1151                 *TBAAAllocation, *TBAAPointer;
1152    llvm::MDBuilder MDHelper(*Context);
1153    TBAARenderScriptDistinct =
1154      MDHelper.createTBAARoot(kRenderScriptTBAARootName);
1155    TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
1156        TBAARenderScriptDistinct);
1157    TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
1158                                                       TBAARenderScript);
1159    TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
1160                                                      TBAAAllocation, 0);
1161    TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer",
1162                                                    TBAARenderScript);
1163    TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);
1164
1165    auto AccumulatorArgIter = FnAccumulator->arg_begin();
1166
1167    // Create empty accumulator function.
1168    llvm::Function *FnExpandedAccumulator =
1169        createEmptyExpandedReduceAccumulator(FnAccumulator->getName(),
1170                                             (AccumulatorArgIter++)->getType());
1171
1172    // Extract the expanded accumulator's parameters.  It is
1173    // guaranteed by createEmptyExpandedReduceAccumulator that
1174    // there will be 4 parameters.
1175    bccAssert(FnExpandedAccumulator->arg_size() == kNumExpandedReduceAccumulatorParams);
1176    auto ExpandedAccumulatorArgIter = FnExpandedAccumulator->arg_begin();
1177    llvm::Value *Arg_p     = &*(ExpandedAccumulatorArgIter++);
1178    llvm::Value *Arg_x1    = &*(ExpandedAccumulatorArgIter++);
1179    llvm::Value *Arg_x2    = &*(ExpandedAccumulatorArgIter++);
1180    llvm::Value *Arg_accum = &*(ExpandedAccumulatorArgIter++);
1181
1182    // Construct the actual function body.
1183    llvm::IRBuilder<> Builder(&*FnExpandedAccumulator->getEntryBlock().begin());
1184
1185    // Create the loop structure.
1186    llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
1187    llvm::Value *IndVar;
1188    createLoop(Builder, Arg_x1, Arg_x2, &IndVar);
1189
1190    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
1191    const int CalleeArgsContextIdx =
1192        ExpandSpecialArguments(Signature, IndVar, Arg_p, Builder, CalleeArgs,
1193                               [](){}, LoopHeader->getTerminator());
1194
1195    llvm::SmallVector<llvm::Type*,  8> InTypes;
1196    llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
1197    llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
1198    ExpandInputsLoopInvariant(Builder, LoopHeader, Arg_p, TBAAPointer, AccumulatorArgIter, NumInputs,
1199                              InTypes, InBufPtrs, InStructTempSlots);
1200
1201    // Populate the actual call to the original accumulator.
1202    llvm::SmallVector<llvm::Value*, 8> RootArgs;
1203    RootArgs.push_back(Arg_accum);
1204    ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInputs, InTypes, InBufPtrs, InStructTempSlots,
1205                     IndVar, RootArgs);
1206    finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *FnAccumulator, Builder);
1207    Builder.CreateCall(FnAccumulator, RootArgs);
1208
1209    return true;
1210  }
1211
1212  // Create a combiner function for a general reduce-style kernel that lacks one,
1213  // by calling the accumulator function.
1214  //
1215  // The accumulator function must be of the form
1216  //
1217  //   define void @accumFn(accumType* %accum, accumType %in)
1218  //
1219  // A combiner function will be generated of the form
1220  //
1221  //   define void @accumFn.combiner(accumType* %accum, accumType* %other) {
1222  //     %1 = load accumType, accumType* %other
1223  //     call void @accumFn(accumType* %accum, accumType %1);
1224  //   }
1225  bool CreateReduceCombinerFromAccumulator(llvm::Function *FnAccumulator) {
1226    ALOGV("Creating combiner from accumulator %s for general reduce kernel",
1227          FnAccumulator->getName().str().c_str());
1228
1229    using llvm::Attribute;
1230
1231    bccAssert(FnAccumulator->arg_size() == 2);
1232    auto AccumulatorArgIter = FnAccumulator->arg_begin();
1233    llvm::Value *AccumulatorArg_accum = &*(AccumulatorArgIter++);
1234    llvm::Value *AccumulatorArg_in    = &*(AccumulatorArgIter++);
1235    llvm::Type *AccumulatorArgType = AccumulatorArg_accum->getType();
1236    bccAssert(AccumulatorArgType->isPointerTy());
1237
1238    llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
1239    llvm::FunctionType *CombinerType =
1240        llvm::FunctionType::get(VoidTy, { AccumulatorArgType, AccumulatorArgType }, false);
1241    llvm::Function *FnCombiner =
1242        llvm::Function::Create(CombinerType, llvm::GlobalValue::ExternalLinkage,
1243                               nameReduceCombinerFromAccumulator(FnAccumulator->getName()),
1244                               Module);
1245
1246    auto CombinerArgIter = FnCombiner->arg_begin();
1247
1248    llvm::Argument *CombinerArg_accum = &(*CombinerArgIter++);
1249    CombinerArg_accum->setName("accum");
1250    CombinerArg_accum->addAttr(llvm::AttributeSet::get(*Context, CombinerArg_accum->getArgNo() + 1,
1251                                                       llvm::makeArrayRef(Attribute::NoCapture)));
1252
1253    llvm::Argument *CombinerArg_other = &(*CombinerArgIter++);
1254    CombinerArg_other->setName("other");
1255    CombinerArg_other->addAttr(llvm::AttributeSet::get(*Context, CombinerArg_other->getArgNo() + 1,
1256                                                       llvm::makeArrayRef(Attribute::NoCapture)));
1257
1258    llvm::BasicBlock *BB = llvm::BasicBlock::Create(*Context, "BB", FnCombiner);
1259    llvm::IRBuilder<> Builder(BB);
1260
1261    if (AccumulatorArg_in->getType()->isPointerTy()) {
1262      // Types of sufficient size get passed by pointer-to-copy rather
1263      // than passed by value.  An accumulator cannot take a pointer
1264      // at the user level; so if we see a pointer here, we know that
1265      // we have a pass-by-pointer-to-copy case.
1266      llvm::Type *ElementType = AccumulatorArg_in->getType()->getPointerElementType();
1267      llvm::Value *TempMem = Builder.CreateAlloca(ElementType, nullptr, "caller_copy");
1268      Builder.CreateStore(Builder.CreateLoad(CombinerArg_other), TempMem);
1269      Builder.CreateCall(FnAccumulator, { CombinerArg_accum, TempMem });
1270    } else {
1271      llvm::Value *TypeAdjustedOther = CombinerArg_other;
1272      if (AccumulatorArgType->getPointerElementType() != AccumulatorArg_in->getType()) {
1273        // Call lowering by frontend has done some type coercion
1274        TypeAdjustedOther = Builder.CreatePointerCast(CombinerArg_other,
1275                                                      AccumulatorArg_in->getType()->getPointerTo(),
1276                                                      "cast");
1277      }
1278      llvm::Value *DerefOther = Builder.CreateLoad(TypeAdjustedOther);
1279      Builder.CreateCall(FnAccumulator, { CombinerArg_accum, DerefOther });
1280    }
1281    Builder.CreateRetVoid();
1282
1283    return true;
1284  }
1285
1286  /// @brief Checks if pointers to allocation internals are exposed
1287  ///
1288  /// This function verifies if through the parameters passed to the kernel
1289  /// or through calls to the runtime library the script gains access to
1290  /// pointers pointing to data within a RenderScript Allocation.
1291  /// If we know we control all loads from and stores to data within
1292  /// RenderScript allocations and if we know the run-time internal accesses
1293  /// are all annotated with RenderScript TBAA metadata, only then we
1294  /// can safely use TBAA to distinguish between generic and from-allocation
1295  /// pointers.
1296  bool allocPointersExposed(llvm::Module &Module) {
1297    // Old style kernel function can expose pointers to elements within
1298    // allocations.
1299    // TODO: Extend analysis to allow simple cases of old-style kernels.
1300    for (size_t i = 0; i < mExportForEachCount; ++i) {
1301      const char *Name = mExportForEachNameList[i];
1302      uint32_t Signature = mExportForEachSignatureList[i];
1303      if (Module.getFunction(Name) &&
1304          !bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature)) {
1305        return true;
1306      }
1307    }
1308
1309    // Check for library functions that expose a pointer to an Allocation or
1310    // that are not yet annotated with RenderScript-specific tbaa information.
1311    static const std::vector<const char *> Funcs{
1312      // rsGetElementAt(...)
1313      "_Z14rsGetElementAt13rs_allocationj",
1314      "_Z14rsGetElementAt13rs_allocationjj",
1315      "_Z14rsGetElementAt13rs_allocationjjj",
1316
1317      // rsSetElementAt()
1318      "_Z14rsSetElementAt13rs_allocationPvj",
1319      "_Z14rsSetElementAt13rs_allocationPvjj",
1320      "_Z14rsSetElementAt13rs_allocationPvjjj",
1321
1322      // rsGetElementAtYuv_uchar_Y()
1323      "_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj",
1324
1325      // rsGetElementAtYuv_uchar_U()
1326      "_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj",
1327
1328      // rsGetElementAtYuv_uchar_V()
1329      "_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj",
1330    };
1331
1332    for (auto FI : Funcs) {
1333      llvm::Function *Function = Module.getFunction(FI);
1334
1335      if (!Function) {
1336        ALOGE("Missing run-time function '%s'", FI);
1337        return true;
1338      }
1339
1340      if (Function->getNumUses() > 0) {
1341        return true;
1342      }
1343    }
1344
1345    return false;
1346  }
1347
1348  /// @brief Connect RenderScript TBAA metadata to C/C++ metadata
1349  ///
1350  /// The TBAA metadata used to annotate loads/stores from RenderScript
1351  /// Allocations is generated in a separate TBAA tree with a
1352  /// "RenderScript Distinct TBAA" root node. LLVM does assume may-alias for
1353  /// all nodes in unrelated alias analysis trees. This function makes the
1354  /// "RenderScript TBAA" node (which is parented by the Distinct TBAA root),
1355  /// a subtree of the normal C/C++ TBAA tree aside of normal C/C++ types. With
1356  /// the connected trees every access to an Allocation is resolved to
1357  /// must-alias if compared to a normal C/C++ access.
1358  void connectRenderScriptTBAAMetadata(llvm::Module &Module) {
1359    llvm::MDBuilder MDHelper(*Context);
1360    llvm::MDNode *TBAARenderScriptDistinct =
1361      MDHelper.createTBAARoot("RenderScript Distinct TBAA");
1362    llvm::MDNode *TBAARenderScript = MDHelper.createTBAANode(
1363        "RenderScript TBAA", TBAARenderScriptDistinct);
1364    llvm::MDNode *TBAARoot     = MDHelper.createTBAARoot("Simple C/C++ TBAA");
1365    TBAARenderScript->replaceOperandWith(1, TBAARoot);
1366  }
1367
1368  virtual bool runOnModule(llvm::Module &Module) {
1369    bool Changed  = false;
1370    this->Module  = &Module;
1371    Context = &Module.getContext();
1372
1373    buildTypes();
1374
1375    bcinfo::MetadataExtractor me(&Module);
1376    if (!me.extract()) {
1377      ALOGE("Could not extract metadata from module!");
1378      return false;
1379    }
1380
1381    mStructExplicitlyPaddedBySlang = (me.getCompilerVersion() >= SlangVersion::N_STRUCT_EXPLICIT_PADDING);
1382
1383    // Expand forEach_* style kernels.
1384    mExportForEachCount = me.getExportForEachSignatureCount();
1385    mExportForEachNameList = me.getExportForEachNameList();
1386    mExportForEachSignatureList = me.getExportForEachSignatureList();
1387
1388    for (size_t i = 0; i < mExportForEachCount; ++i) {
1389      const char *name = mExportForEachNameList[i];
1390      uint32_t signature = mExportForEachSignatureList[i];
1391      llvm::Function *kernel = Module.getFunction(name);
1392      if (kernel) {
1393        if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) {
1394          Changed |= ExpandForEach(kernel, signature);
1395          kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
1396        } else if (kernel->getReturnType()->isVoidTy()) {
1397          Changed |= ExpandOldStyleForEach(kernel, signature);
1398          kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
1399        } else {
1400          // There are some graphics root functions that are not
1401          // expanded, but that will be called directly. For those
1402          // functions, we can not set the linkage to internal.
1403        }
1404      }
1405    }
1406
1407    // Process general reduce_* style functions.
1408    const size_t ExportReduceCount = me.getExportReduceCount();
1409    const bcinfo::MetadataExtractor::Reduce *ExportReduceList = me.getExportReduceList();
1410    //   Note that functions can be shared between kernels
1411    FunctionSet PromotedFunctions, ExpandedAccumulators, AccumulatorsForCombiners;
1412
1413    for (size_t i = 0; i < ExportReduceCount; ++i) {
1414      Changed |= PromoteReduceFunction(ExportReduceList[i].mInitializerName, PromotedFunctions);
1415      Changed |= PromoteReduceFunction(ExportReduceList[i].mCombinerName, PromotedFunctions);
1416      Changed |= PromoteReduceFunction(ExportReduceList[i].mOutConverterName, PromotedFunctions);
1417
1418      // Accumulator
1419      llvm::Function *accumulator = Module.getFunction(ExportReduceList[i].mAccumulatorName);
1420      bccAssert(accumulator != nullptr);
1421      if (ExpandedAccumulators.insert(accumulator).second)
1422        Changed |= ExpandReduceAccumulator(accumulator,
1423                                           ExportReduceList[i].mSignature,
1424                                           ExportReduceList[i].mInputCount);
1425      if (!ExportReduceList[i].mCombinerName) {
1426        if (AccumulatorsForCombiners.insert(accumulator).second)
1427          Changed |= CreateReduceCombinerFromAccumulator(accumulator);
1428      }
1429    }
1430
1431    if (gEnableRsTbaa && !allocPointersExposed(Module)) {
1432      connectRenderScriptTBAAMetadata(Module);
1433    }
1434
1435    return Changed;
1436  }
1437
1438  virtual const char *getPassName() const {
1439    return "forEach_* and reduce_* function expansion";
1440  }
1441
1442}; // end RSKernelExpandPass
1443
1444} // end anonymous namespace
1445
1446char RSKernelExpandPass::ID = 0;
1447static llvm::RegisterPass<RSKernelExpandPass> X("kernelexp", "Kernel Expand Pass");
1448
1449namespace bcc {
1450
1451const char BCC_INDEX_VAR_NAME[] = "rsIndex";
1452
1453llvm::ModulePass *
1454createRSKernelExpandPass(bool pEnableStepOpt) {
1455  return new RSKernelExpandPass(pEnableStepOpt);
1456}
1457
1458} // end namespace bcc
1459