//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
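  // The popcntd instruction (Power ISA 2.06, e.g. POWER7 and newer cores)
  // provides a fast hardware population count for widths up to 64 bits.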
  if (ST->hasPOPCNTD() && TyWidth <= 64)
    return TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

unsigned PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

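  // The tiers below roughly track the number of instructions needed to
  // materialize the constant: a signed 16-bit value fits a single li/addi,
  // a 32-bit value needs lis (plus ori unless the low halfword is zero), and
  // anything wider is assumed to need a longer shift-and-or sequence.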
  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

  return 4 * TTI::TCC_Basic;
}

unsigned PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                                   const APInt &Imm, Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(IID, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

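  // Immediates that fold directly into the intrinsic cost nothing extra: the
  // overflow-checking add/sub intrinsics can encode a signed 16-bit immediate
  // in the instruction itself, and the numeric operands of stackmaps and
  // patchpoints are recorded in the stack map rather than materialized in
  // registers.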
  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

unsigned PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
                                   const APInt &Imm, Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
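  // Each flag marks an immediate form that PPC can encode directly:
  //   ShiftedFree  - all low 16 bits clear, usable by the shifted-immediate
  //                  forms (addis/oris/xoris/andis.).
  //   RunFree      - a contiguous run of mask bits (or its complement),
  //                  handled by the rotate-and-mask instructions
  //                  (rlwinm/rldicl and friends).
  //   UnsignedFree - an unsigned 16-bit value, usable by the logical compare
  //                  forms (cmplwi/cmpldi).
  //   ZeroFree     - a comparison or select against zero is free.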
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    // Fallthrough...
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    // Fallthrough...
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Fallthrough... (zero comparisons can use record-form instructions)
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

void PPCTTIImpl::getUnrollingPreferences(Loop *L,
                                         TTI::UnrollingPreferences &UP) {
  if (ST->getDarwinDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;
  }

  BaseT::getUnrollingPreferences(L, UP);
}

bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
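  // Aggressive interleaving mainly pays off when the loop contains reductions:
  // interleaving splits the single cross-iteration dependence chain into
  // several independent chains that can execute in parallel.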
  return LoopHasReductions;
}

unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasAltivec() && !ST->hasQPX())
    return 0;
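  // VSX provides 64 vector-scalar registers (overlapping the 32 FPRs and the
  // 32 Altivec VRs); without it, each register class has 32 registers.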
  return ST->hasVSX() ? 64 : 32;
}

unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) {
  if (Vector) {
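    // QPX (Blue Gene/Q) vectors are 256 bits wide (4 x double); Altivec/VSX
    // vectors are 128 bits.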
    if (ST->hasQPX()) return 256;
    if (ST->hasAltivec()) return 128;
    return 0;
  }

  if (ST->isPPC64())
    return 64;
  return 32;
}

unsigned PPCTTIImpl::getMaxInterleaveFactor() {
  unsigned Directive = ST->getDarwinDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  if (Directive == PPC::DIR_PWR7 ||
      Directive == PPC::DIR_PWR8)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

unsigned PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  // Fall back to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

unsigned PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                    Type *SubTp) {
  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

unsigned PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  return BaseT::getCastInstrCost(Opcode, Dst, Src);
}

unsigned PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                        Type *CondTy) {
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

unsigned PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                        unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
    // Floating point scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  }

  // Estimated cost of a load-hit-store delay.  This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark.  It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
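  // Inserts are penalized more heavily than extracts: without direct moves,
  // getting a scalar into a vector register goes through a store/reload round
  // trip in addition to the stall modeled above.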
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element inserts/extracts with Altivec are very expensive because
  // they require a store and reload, with the attendant processor stall for
  // the load-hit-store.  Until VSX is available, these need to be estimated
  // as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);

  return BaseT::getVectorInstrCost(Opcode, Val, Index);
}

unsigned PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                     unsigned Alignment,
                                     unsigned AddressSpace) {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  unsigned Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);

  // VSX loads/stores support unaligned access.
  if (ST->hasVSX()) {
    if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64)
      return Cost;
  }

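  // Whole 128-bit Altivec vector loads tolerate misalignment reasonably well:
  // they are expanded into aligned lvx loads combined with a vperm, so skip
  // the decomposition penalty below for them.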
  bool UnalignedAltivec =
    Src->isVectorTy() &&
    Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() &&
    LT.second.getSizeInBits() == 128 &&
    Opcode == Instruction::Load;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.
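  // Model the split as one extra operation per aligned piece beyond the
  // first, i.e. SrcBytes/Alignment - 1 additional operations for each
  // legalized part.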
  unsigned SrcBytes = LT.second.getStoreSize();
  if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) {
    Cost += LT.first*(SrcBytes/Alignment-1);

    // For a vector type, there is also scalarization overhead (only for
    // stores; loads are expanded using the vector-load + permutation sequence,
    // which is much less expensive).
    if (Src->isVectorTy() && Opcode == Instruction::Store)
      for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
        Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
  }

  return Cost;
}