//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool>
    DisablePPCConstHoist("disable-ppc-constant-hoisting",
                         cl::desc("disable constant hoisting on PPC"),
                         cl::init(false), cl::Hidden);

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//
TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() && TyWidth <= 64)
    return TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

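// Estimate the cost of materializing the constant Imm in a register. The
// buckets below mirror the PPC materialization sequences: a signed 16-bit
// value fits a single li/addi, a 32-bit value needs an lis (plus an ori when
// the low halfword is nonzero), and anything wider needs a longer
// multi-instruction sequence.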
int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

  return 4 * TTI::TCC_Basic;
}

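// Estimate the cost of the constant Imm appearing as operand Idx of the given
// intrinsic. Stackmap and patchpoint immediates are free because they are
// recorded rather than materialized, and the *_with_overflow intrinsics can
// fold a signed 16-bit immediate directly into the add/sub instruction.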
int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(IID, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

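// Estimate the cost of the constant Imm appearing as operand Idx of a regular
// instruction. Most PPC arithmetic and logical instructions have a signed
// 16-bit immediate form (or a shifted 16-bit form, e.g. addis/oris), and the
// rotate-and-mask instructions (rlwinm/rldicl and friends) encode any
// contiguous run of ones, or its complement, for free.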
int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    // Fallthrough...
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    // Fallthrough...
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Fallthrough... (zero comparisons can use record-form instructions)
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

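// Tune the loop-unrolling heuristics for PPC subtargets (currently only the
// in-order A2); the target-independent preferences from the base
// implementation are still applied afterwards.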
void PPCTTIImpl::getUnrollingPreferences(Loop *L,
                                         TTI::UnrollingPreferences &UP) {
  if (ST->getDarwinDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;

    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
    // often outweigh the cost of a division to compute the trip count.
    UP.AllowExpensiveTripCount = true;
  }

  BaseT::getUnrollingPreferences(L, UP);
}

bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  // On the A2, always interleave aggressively. For QPX unaligned loads, we
  // depend on combining the loads generated for consecutive accesses, and
  // failing to do so is particularly expensive. Aggressive interleaving makes
  // that combining much more likely than concatenation unrolling alone.
  if (ST->getDarwinDirective() == PPC::DIR_A2)
    return true;

  return LoopHasReductions;
}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  return true;
}

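// Register-file sizes reported to the vectorizer. VSX unifies the
// floating-point and Altivec register files into 64 VSX registers; without a
// vector unit (Altivec or QPX) no vector registers are reported at all.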
unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasAltivec() && !ST->hasQPX())
    return 0;
  return ST->hasVSX() ? 64 : 32;
}

unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) {
  if (Vector) {
    if (ST->hasQPX()) return 256;
    if (ST->hasAltivec()) return 128;
    return 0;
  }

  if (ST->isPPC64())
    return 64;
  return 32;
}

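// Suggest how many independent copies of a loop body the vectorizer should
// interleave, based on the floating-point latency and the number of execution
// units of the subtarget, so that enough independent work is in flight to
// hide latency.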
unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  unsigned Directive = ST->getDarwinDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

int PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  // Fall back to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). We need one such shuffle instruction for each actual
  // register (this is not true for arbitrary shuffles, but is true for the
  // structured types of shuffles covered by TTI::ShuffleKind).
  return LT.first;
}

int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  return BaseT::getCastInstrCost(Opcode, Dst, Src);
}

int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
    // Floating-point scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  }

  // Estimated cost of a load-hit-store delay. This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark. It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element inserts and extracts with Altivec are very expensive,
  // because they require a store and reload with the attendant processor
  // stall for load-hit-store. Until VSX is available, these need to be
  // estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT || ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);

  return BaseT::getVectorInstrCost(Opcode, Val, Index);
}

int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                unsigned AddressSpace) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);

  // Aligned loads and stores are easy.
  unsigned SrcBytes = LT.second.getStoreSize();
  if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
    return Cost;

  bool IsAltivecType = ST->hasAltivec() &&
                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  bool IsVSXType = ST->hasVSX() &&
                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
  bool IsQPXType = ST->hasQPX() &&
                   (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);

  // If we can use the permutation-based load sequence, then this is also
  // relatively cheap (not counting loop-invariant instructions): one load plus
  // one permute (the last load in a series has extra cost, but we're
  // neglecting that here). Note that on the P7, we could do unaligned loads
  // for Altivec types using the VSX instructions, but that's more expensive
  // than using the permutation-based load sequence. On the P8, that's no
  // longer true.
  if (Opcode == Instruction::Load &&
      ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
      Alignment >= LT.second.getScalarType().getStoreSize())
    return Cost + LT.first; // Add the cost of the permutations.

  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
  // P7, unaligned vector loads are more expensive than the permutation-based
  // load sequence, so that might be used instead, but regardless, the net cost
  // is about the same (not counting loop-invariant instructions).
  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
    return Cost;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.

  // Add the cost of each scalar load or store.
  Cost += LT.first * (SrcBytes / Alignment - 1);

  // For a vector type, there is also scalarization overhead (only for
  // stores; loads are expanded using the vector-load + permutation sequence,
  // which is much less expensive).
  if (Src->isVectorTy() && Opcode == Instruction::Store)
    for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);

  return Cost;
}

int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                           unsigned Factor,
                                           ArrayRef<unsigned> Indices,
                                           unsigned Alignment,
                                           unsigned AddressSpace) {
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);

  // First, the cost of the load/store operation itself.
  int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);

  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). For each result vector, we need one shuffle per incoming
  // vector (except that the first shuffle can take two incoming vectors
  // because it does not need to take itself).
  Cost += Factor * (LT.first - 1);

  return Cost;
}