LoopStrengthReduce.cpp revision 701a4aef7fa0ece4dc1fdbc88b981820564cb4e4
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This transformation analyzes and transforms the induction variables (and
11// computations derived from them) into forms suitable for efficient execution
12// on the target.
13//
14// This pass performs a strength reduction on array references inside loops that
15// have as one or more of their components the loop induction variable, it
16// rewrites expressions to take advantage of scaled-index addressing modes
17// available on the target, and it performs a variety of other optimizations
18// related to loop induction variables.
19//
20// Terminology note: this code has a lot of handling for "post-increment" or
21// "post-inc" users. This is not talking about post-increment addressing modes;
22// it is instead talking about code like this:
23//
24//   %i = phi [ 0, %entry ], [ %i.next, %latch ]
25//   ...
26//   %i.next = add %i, 1
27//   %c = icmp eq %i.next, %n
28//
29// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
30// it's useful to think about these as the same register, with some uses using
// the value of the register before the add and some using it after. In this
32// example, the icmp is a post-increment user, since it uses %i.next, which is
33// the value of the induction variable after the increment. The other common
34// case of post-increment users is users outside the loop.
35//
36// TODO: More sophistication in the way Formulae are generated and filtered.
37//
38// TODO: Handle multiple loops at a time.
39//
40// TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr
41//       instead of a GlobalValue?
42//
43// TODO: When truncation is free, truncate ICmp users' operands to make it a
44//       smaller encoding (on x86 at least).
45//
46// TODO: When a negated register is used by an add (such as in a list of
47//       multiple base registers, or as the increment expression in an addrec),
48//       we may not actually need both reg and (-1 * reg) in registers; the
49//       negation can be implemented by using a sub instead of an add. The
50//       lack of support for taking this into consideration when making
51//       register pressure decisions is partly worked around by the "Special"
52//       use kind.
53//
54//===----------------------------------------------------------------------===//
55
56#define DEBUG_TYPE "loop-reduce"
57#include "llvm/Transforms/Scalar.h"
58#include "llvm/Constants.h"
59#include "llvm/Instructions.h"
60#include "llvm/IntrinsicInst.h"
61#include "llvm/DerivedTypes.h"
62#include "llvm/Analysis/IVUsers.h"
63#include "llvm/Analysis/Dominators.h"
64#include "llvm/Analysis/LoopPass.h"
65#include "llvm/Analysis/ScalarEvolutionExpander.h"
66#include "llvm/Transforms/Utils/BasicBlockUtils.h"
67#include "llvm/Transforms/Utils/Local.h"
68#include "llvm/ADT/SmallBitVector.h"
69#include "llvm/ADT/SetVector.h"
70#include "llvm/ADT/DenseSet.h"
71#include "llvm/Support/Debug.h"
72#include "llvm/Support/ValueHandle.h"
73#include "llvm/Support/raw_ostream.h"
74#include "llvm/Target/TargetLowering.h"
75#include <algorithm>
76using namespace llvm;
77
78namespace {
79
80/// RegSortData - This class holds data which is used to order reuse candidates.
81class RegSortData {
82public:
83  /// UsedByIndices - This represents the set of LSRUse indices which reference
84  /// a particular register.
85  SmallBitVector UsedByIndices;
86
87  RegSortData() {}
88
89  void print(raw_ostream &OS) const;
90  void dump() const;
91};
92
93}
94
95void RegSortData::print(raw_ostream &OS) const {
96  OS << "[NumUses=" << UsedByIndices.count() << ']';
97}
98
99void RegSortData::dump() const {
100  print(errs()); errs() << '\n';
101}
102
103namespace {
104
105/// RegUseTracker - Map register candidates to information about how they are
106/// used.
107class RegUseTracker {
108  typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
109
110  RegUsesTy RegUses;
111  SmallVector<const SCEV *, 16> RegSequence;
112
113public:
114  void CountRegister(const SCEV *Reg, size_t LUIdx);
115
116  bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
117
118  const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
119
120  void clear();
121
122  typedef SmallVectorImpl<const SCEV *>::iterator iterator;
123  typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
124  iterator begin() { return RegSequence.begin(); }
125  iterator end()   { return RegSequence.end(); }
126  const_iterator begin() const { return RegSequence.begin(); }
127  const_iterator end() const   { return RegSequence.end(); }
128};
129
130}
131
132void
133RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) {
134  std::pair<RegUsesTy::iterator, bool> Pair =
135    RegUses.insert(std::make_pair(Reg, RegSortData()));
136  RegSortData &RSD = Pair.first->second;
137  if (Pair.second)
138    RegSequence.push_back(Reg);
139  RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
140  RSD.UsedByIndices.set(LUIdx);
141}
142
143bool
144RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
145  if (!RegUses.count(Reg)) return false;
146  const SmallBitVector &UsedByIndices =
147    RegUses.find(Reg)->second.UsedByIndices;
148  int i = UsedByIndices.find_first();
149  if (i == -1) return false;
150  if ((size_t)i != LUIdx) return true;
151  return UsedByIndices.find_next(i) != -1;
152}
153
154const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
155  RegUsesTy::const_iterator I = RegUses.find(Reg);
156  assert(I != RegUses.end() && "Unknown register!");
157  return I->second.UsedByIndices;
158}
159
160void RegUseTracker::clear() {
161  RegUses.clear();
162  RegSequence.clear();
163}
164
165namespace {
166
167/// Formula - This class holds information that describes a formula for
168/// computing satisfying a use. It may include broken-out immediates and scaled
169/// registers.
170struct Formula {
171  /// AM - This is used to represent complex addressing, as well as other kinds
172  /// of interesting uses.
173  TargetLowering::AddrMode AM;
174
175  /// BaseRegs - The list of "base" registers for this use. When this is
176  /// non-empty, AM.HasBaseReg should be set to true.
177  SmallVector<const SCEV *, 2> BaseRegs;
178
179  /// ScaledReg - The 'scaled' register for this use. This should be non-null
180  /// when AM.Scale is not zero.
181  const SCEV *ScaledReg;
182
183  Formula() : ScaledReg(0) {}
184
185  void InitialMatch(const SCEV *S, Loop *L,
186                    ScalarEvolution &SE, DominatorTree &DT);
187
188  unsigned getNumRegs() const;
189  const Type *getType() const;
190
191  bool referencesReg(const SCEV *S) const;
192  bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
193                                  const RegUseTracker &RegUses) const;
194
195  void print(raw_ostream &OS) const;
196  void dump() const;
197};
198
199}
200
201/// DoInitialMatch - Recursion helper for InitialMatch.
202static void DoInitialMatch(const SCEV *S, Loop *L,
203                           SmallVectorImpl<const SCEV *> &Good,
204                           SmallVectorImpl<const SCEV *> &Bad,
205                           ScalarEvolution &SE, DominatorTree &DT) {
206  // Collect expressions which properly dominate the loop header.
207  if (S->properlyDominates(L->getHeader(), &DT)) {
208    Good.push_back(S);
209    return;
210  }
211
212  // Look at add operands.
213  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
214    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
215         I != E; ++I)
216      DoInitialMatch(*I, L, Good, Bad, SE, DT);
217    return;
218  }
219
220  // Look at addrec operands.
221  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
222    if (!AR->getStart()->isZero()) {
223      DoInitialMatch(AR->getStart(), L, Good, Bad, SE, DT);
224      DoInitialMatch(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()),
225                                      AR->getStepRecurrence(SE),
226                                      AR->getLoop()),
227                     L, Good, Bad, SE, DT);
228      return;
229    }
230
231  // Handle a multiplication by -1 (negation) if it didn't fold.
232  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
233    if (Mul->getOperand(0)->isAllOnesValue()) {
234      SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
235      const SCEV *NewMul = SE.getMulExpr(Ops);
236
237      SmallVector<const SCEV *, 4> MyGood;
238      SmallVector<const SCEV *, 4> MyBad;
239      DoInitialMatch(NewMul, L, MyGood, MyBad, SE, DT);
240      const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
241        SE.getEffectiveSCEVType(NewMul->getType())));
242      for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(),
243           E = MyGood.end(); I != E; ++I)
244        Good.push_back(SE.getMulExpr(NegOne, *I));
245      for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(),
246           E = MyBad.end(); I != E; ++I)
247        Bad.push_back(SE.getMulExpr(NegOne, *I));
248      return;
249    }
250
251  // Ok, we can't do anything interesting. Just stuff the whole thing into a
252  // register and hope for the best.
253  Bad.push_back(S);
254}
255
256/// InitialMatch - Incorporate loop-variant parts of S into this Formula,
257/// attempting to keep all loop-invariant and loop-computable values in a
258/// single base register.
259void Formula::InitialMatch(const SCEV *S, Loop *L,
260                           ScalarEvolution &SE, DominatorTree &DT) {
261  SmallVector<const SCEV *, 4> Good;
262  SmallVector<const SCEV *, 4> Bad;
263  DoInitialMatch(S, L, Good, Bad, SE, DT);
264  if (!Good.empty()) {
265    BaseRegs.push_back(SE.getAddExpr(Good));
266    AM.HasBaseReg = true;
267  }
268  if (!Bad.empty()) {
269    BaseRegs.push_back(SE.getAddExpr(Bad));
270    AM.HasBaseReg = true;
271  }
272}
273
274/// getNumRegs - Return the total number of register operands used by this
275/// formula. This does not include register uses implied by non-constant
276/// addrec strides.
277unsigned Formula::getNumRegs() const {
278  return !!ScaledReg + BaseRegs.size();
279}
280
281/// getType - Return the type of this formula, if it has one, or null
282/// otherwise. This type is meaningless except for the bit size.
283const Type *Formula::getType() const {
284  return !BaseRegs.empty() ? BaseRegs.front()->getType() :
285         ScaledReg ? ScaledReg->getType() :
286         AM.BaseGV ? AM.BaseGV->getType() :
287         0;
288}
289
290/// referencesReg - Test if this formula references the given register.
291bool Formula::referencesReg(const SCEV *S) const {
292  return S == ScaledReg ||
293         std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
294}
295
296/// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers
297/// which are used by uses other than the use with the given index.
298bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
299                                         const RegUseTracker &RegUses) const {
300  if (ScaledReg)
301    if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
302      return true;
303  for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
304       E = BaseRegs.end(); I != E; ++I)
305    if (RegUses.isRegUsedByUsesOtherThan(*I, LUIdx))
306      return true;
307  return false;
308}
309
310void Formula::print(raw_ostream &OS) const {
311  bool First = true;
312  if (AM.BaseGV) {
313    if (!First) OS << " + "; else First = false;
314    WriteAsOperand(OS, AM.BaseGV, /*PrintType=*/false);
315  }
316  if (AM.BaseOffs != 0) {
317    if (!First) OS << " + "; else First = false;
318    OS << AM.BaseOffs;
319  }
320  for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
321       E = BaseRegs.end(); I != E; ++I) {
322    if (!First) OS << " + "; else First = false;
323    OS << "reg(" << **I << ')';
324  }
325  if (AM.Scale != 0) {
326    if (!First) OS << " + "; else First = false;
327    OS << AM.Scale << "*reg(";
328    if (ScaledReg)
329      OS << *ScaledReg;
330    else
331      OS << "<unknown>";
332    OS << ')';
333  }
334}
335
336void Formula::dump() const {
337  print(errs()); errs() << '\n';
338}
339
340/// isAddRecSExtable - Return true if the given addrec can be sign-extended
341/// without changing its value.
342static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
343  const Type *WideTy =
344    IntegerType::get(SE.getContext(),
345                     SE.getTypeSizeInBits(AR->getType()) + 1);
346  return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
347}
348
349/// isAddSExtable - Return true if the given add can be sign-extended
350/// without changing its value.
351static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
352  const Type *WideTy =
353    IntegerType::get(SE.getContext(),
354                     SE.getTypeSizeInBits(A->getType()) + 1);
355  return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
356}
357
358/// isMulSExtable - Return true if the given add can be sign-extended
359/// without changing its value.
360static bool isMulSExtable(const SCEVMulExpr *A, ScalarEvolution &SE) {
361  const Type *WideTy =
362    IntegerType::get(SE.getContext(),
363                     SE.getTypeSizeInBits(A->getType()) + 1);
364  return isa<SCEVMulExpr>(SE.getSignExtendExpr(A, WideTy));
365}
366
367/// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined
368/// and if the remainder is known to be zero,  or null otherwise. If
369/// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified
370/// to Y, ignoring that the multiplication may overflow, which is useful when
371/// the result will be used in a context where the most significant bits are
372/// ignored.
373static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
374                                ScalarEvolution &SE,
375                                bool IgnoreSignificantBits = false) {
376  // Handle the trivial case, which works for any SCEV type.
377  if (LHS == RHS)
378    return SE.getIntegerSCEV(1, LHS->getType());
379
380  // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do some
381  // folding.
382  if (RHS->isAllOnesValue())
383    return SE.getMulExpr(LHS, RHS);
384
385  // Check for a division of a constant by a constant.
386  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
387    const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
388    if (!RC)
389      return 0;
390    if (C->getValue()->getValue().srem(RC->getValue()->getValue()) != 0)
391      return 0;
392    return SE.getConstant(C->getValue()->getValue()
393               .sdiv(RC->getValue()->getValue()));
394  }
395
396  // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
397  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
398    if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
399      const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
400                                       IgnoreSignificantBits);
401      if (!Start) return 0;
402      const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
403                                      IgnoreSignificantBits);
404      if (!Step) return 0;
405      return SE.getAddRecExpr(Start, Step, AR->getLoop());
406    }
407  }
408
409  // Distribute the sdiv over add operands, if the add doesn't overflow.
410  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
411    if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
412      SmallVector<const SCEV *, 8> Ops;
413      for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
414           I != E; ++I) {
415        const SCEV *Op = getExactSDiv(*I, RHS, SE,
416                                      IgnoreSignificantBits);
417        if (!Op) return 0;
418        Ops.push_back(Op);
419      }
420      return SE.getAddExpr(Ops);
421    }
422  }
423
424  // Check for a multiply operand that we can pull RHS out of.
425  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS))
426    if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
427      SmallVector<const SCEV *, 4> Ops;
428      bool Found = false;
429      for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end();
430           I != E; ++I) {
431        if (!Found)
432          if (const SCEV *Q = getExactSDiv(*I, RHS, SE,
433                                           IgnoreSignificantBits)) {
434            Ops.push_back(Q);
435            Found = true;
436            continue;
437          }
438        Ops.push_back(*I);
439      }
440      return Found ? SE.getMulExpr(Ops) : 0;
441    }
442
443  // Otherwise we don't know.
444  return 0;
445}
446
447/// ExtractImmediate - If S involves the addition of a constant integer value,
448/// return that integer value, and mutate S to point to a new SCEV with that
449/// value excluded.
450static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
451  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
452    if (C->getValue()->getValue().getMinSignedBits() <= 64) {
453      S = SE.getIntegerSCEV(0, C->getType());
454      return C->getValue()->getSExtValue();
455    }
456  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
457    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
458    int64_t Result = ExtractImmediate(NewOps.front(), SE);
459    S = SE.getAddExpr(NewOps);
460    return Result;
461  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
462    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
463    int64_t Result = ExtractImmediate(NewOps.front(), SE);
464    S = SE.getAddRecExpr(NewOps, AR->getLoop());
465    return Result;
466  }
467  return 0;
468}
469
470/// ExtractSymbol - If S involves the addition of a GlobalValue address,
471/// return that symbol, and mutate S to point to a new SCEV with that
472/// value excluded.
473static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
474  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
475    if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
476      S = SE.getIntegerSCEV(0, GV->getType());
477      return GV;
478    }
479  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
480    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
481    GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
482    S = SE.getAddExpr(NewOps);
483    return Result;
484  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
485    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
486    GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
487    S = SE.getAddRecExpr(NewOps, AR->getLoop());
488    return Result;
489  }
490  return 0;
491}
492
493/// isAddressUse - Returns true if the specified instruction is using the
494/// specified value as an address.
495static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
496  bool isAddress = isa<LoadInst>(Inst);
497  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
498    if (SI->getOperand(1) == OperandVal)
499      isAddress = true;
500  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
501    // Addressing modes can also be folded into prefetches and a variety
502    // of intrinsics.
503    switch (II->getIntrinsicID()) {
504      default: break;
505      case Intrinsic::prefetch:
506      case Intrinsic::x86_sse2_loadu_dq:
507      case Intrinsic::x86_sse2_loadu_pd:
508      case Intrinsic::x86_sse_loadu_ps:
509      case Intrinsic::x86_sse_storeu_ps:
510      case Intrinsic::x86_sse2_storeu_pd:
511      case Intrinsic::x86_sse2_storeu_dq:
512      case Intrinsic::x86_sse2_storel_dq:
513        if (II->getOperand(1) == OperandVal)
514          isAddress = true;
515        break;
516    }
517  }
518  return isAddress;
519}
520
521/// getAccessType - Return the type of the memory being accessed.
522static const Type *getAccessType(const Instruction *Inst) {
523  const Type *AccessTy = Inst->getType();
524  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
525    AccessTy = SI->getOperand(0)->getType();
526  else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
527    // Addressing modes can also be folded into prefetches and a variety
528    // of intrinsics.
529    switch (II->getIntrinsicID()) {
530    default: break;
531    case Intrinsic::x86_sse_storeu_ps:
532    case Intrinsic::x86_sse2_storeu_pd:
533    case Intrinsic::x86_sse2_storeu_dq:
534    case Intrinsic::x86_sse2_storel_dq:
535      AccessTy = II->getOperand(1)->getType();
536      break;
537    }
538  }
539
540  // All pointers have the same requirements, so canonicalize them to an
541  // arbitrary pointer type to minimize variation.
542  if (const PointerType *PTy = dyn_cast<PointerType>(AccessTy))
543    AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
544                                PTy->getAddressSpace());
545
546  return AccessTy;
547}
548
549/// DeleteTriviallyDeadInstructions - If any of the instructions is the
550/// specified set are trivially dead, delete them and see if this makes any of
551/// their operands subsequently dead.
552static bool
553DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
554  bool Changed = false;
555
556  while (!DeadInsts.empty()) {
557    Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
558
559    if (I == 0 || !isInstructionTriviallyDead(I))
560      continue;
561
562    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
563      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
564        *OI = 0;
565        if (U->use_empty())
566          DeadInsts.push_back(U);
567      }
568
569    I->eraseFromParent();
570    Changed = true;
571  }
572
573  return Changed;
574}
575
576namespace {
577
578/// Cost - This class is used to measure and compare candidate formulae.
579class Cost {
580  /// TODO: Some of these could be merged. Also, a lexical ordering
581  /// isn't always optimal.
582  unsigned NumRegs;
583  unsigned AddRecCost;
584  unsigned NumIVMuls;
585  unsigned NumBaseAdds;
586  unsigned ImmCost;
587  unsigned SetupCost;
588
589public:
590  Cost()
591    : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
592      SetupCost(0) {}
593
594  unsigned getNumRegs() const { return NumRegs; }
595
596  bool operator<(const Cost &Other) const;
597
598  void Loose();
599
600  void RateFormula(const Formula &F,
601                   SmallPtrSet<const SCEV *, 16> &Regs,
602                   const DenseSet<const SCEV *> &VisitedRegs,
603                   const Loop *L,
604                   const SmallVectorImpl<int64_t> &Offsets,
605                   ScalarEvolution &SE, DominatorTree &DT);
606
607  void print(raw_ostream &OS) const;
608  void dump() const;
609
610private:
611  void RateRegister(const SCEV *Reg,
612                    SmallPtrSet<const SCEV *, 16> &Regs,
613                    const Loop *L,
614                    ScalarEvolution &SE, DominatorTree &DT);
615  void RatePrimaryRegister(const SCEV *Reg,
616                           SmallPtrSet<const SCEV *, 16> &Regs,
617                           const Loop *L,
618                           ScalarEvolution &SE, DominatorTree &DT);
619};
620
621}
622
623/// RateRegister - Tally up interesting quantities from the given register.
624void Cost::RateRegister(const SCEV *Reg,
625                        SmallPtrSet<const SCEV *, 16> &Regs,
626                        const Loop *L,
627                        ScalarEvolution &SE, DominatorTree &DT) {
628  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
629    if (AR->getLoop() == L)
630      AddRecCost += 1; /// TODO: This should be a function of the stride.
631
632    // If this is an addrec for a loop that's already been visited by LSR,
633    // don't second-guess its addrec phi nodes. LSR isn't currently smart
634    // enough to reason about more than one loop at a time. Consider these
635    // registers free and leave them alone.
636    else if (L->contains(AR->getLoop()) ||
637             (!AR->getLoop()->contains(L) &&
638              DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) {
639      for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
640           PHINode *PN = dyn_cast<PHINode>(I); ++I)
641        if (SE.isSCEVable(PN->getType()) &&
642            (SE.getEffectiveSCEVType(PN->getType()) ==
643             SE.getEffectiveSCEVType(AR->getType())) &&
644            SE.getSCEV(PN) == AR)
645          return;
646
647      // If this isn't one of the addrecs that the loop already has, it
648      // would require a costly new phi and add. TODO: This isn't
649      // precisely modeled right now.
650      ++NumBaseAdds;
651      if (!Regs.count(AR->getStart()))
652        RateRegister(AR->getStart(), Regs, L, SE, DT);
653    }
654
655    // Add the step value register, if it needs one.
656    // TODO: The non-affine case isn't precisely modeled here.
657    if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1)))
658      if (!Regs.count(AR->getStart()))
659        RateRegister(AR->getOperand(1), Regs, L, SE, DT);
660  }
661  ++NumRegs;
662
663  // Rough heuristic; favor registers which don't require extra setup
664  // instructions in the preheader.
665  if (!isa<SCEVUnknown>(Reg) &&
666      !isa<SCEVConstant>(Reg) &&
667      !(isa<SCEVAddRecExpr>(Reg) &&
668        (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
669         isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
670    ++SetupCost;
671}
672
673/// RatePrimaryRegister - Record this register in the set. If we haven't seen it
674/// before, rate it.
675void Cost::RatePrimaryRegister(const SCEV *Reg,
676                               SmallPtrSet<const SCEV *, 16> &Regs,
677                               const Loop *L,
678                               ScalarEvolution &SE, DominatorTree &DT) {
679  if (Regs.insert(Reg))
680    RateRegister(Reg, Regs, L, SE, DT);
681}
682
683void Cost::RateFormula(const Formula &F,
684                       SmallPtrSet<const SCEV *, 16> &Regs,
685                       const DenseSet<const SCEV *> &VisitedRegs,
686                       const Loop *L,
687                       const SmallVectorImpl<int64_t> &Offsets,
688                       ScalarEvolution &SE, DominatorTree &DT) {
689  // Tally up the registers.
690  if (const SCEV *ScaledReg = F.ScaledReg) {
691    if (VisitedRegs.count(ScaledReg)) {
692      Loose();
693      return;
694    }
695    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT);
696  }
697  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
698       E = F.BaseRegs.end(); I != E; ++I) {
699    const SCEV *BaseReg = *I;
700    if (VisitedRegs.count(BaseReg)) {
701      Loose();
702      return;
703    }
704    RatePrimaryRegister(BaseReg, Regs, L, SE, DT);
705
706    NumIVMuls += isa<SCEVMulExpr>(BaseReg) &&
707                 BaseReg->hasComputableLoopEvolution(L);
708  }
709
710  if (F.BaseRegs.size() > 1)
711    NumBaseAdds += F.BaseRegs.size() - 1;
712
713  // Tally up the non-zero immediates.
714  for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
715       E = Offsets.end(); I != E; ++I) {
716    int64_t Offset = (uint64_t)*I + F.AM.BaseOffs;
717    if (F.AM.BaseGV)
718      ImmCost += 64; // Handle symbolic values conservatively.
719                     // TODO: This should probably be the pointer size.
720    else if (Offset != 0)
721      ImmCost += APInt(64, Offset, true).getMinSignedBits();
722  }
723}
724
725/// Loose - Set this cost to a loosing value.
726void Cost::Loose() {
727  NumRegs = ~0u;
728  AddRecCost = ~0u;
729  NumIVMuls = ~0u;
730  NumBaseAdds = ~0u;
731  ImmCost = ~0u;
732  SetupCost = ~0u;
733}
734
735/// operator< - Choose the lower cost.
736bool Cost::operator<(const Cost &Other) const {
737  if (NumRegs != Other.NumRegs)
738    return NumRegs < Other.NumRegs;
739  if (AddRecCost != Other.AddRecCost)
740    return AddRecCost < Other.AddRecCost;
741  if (NumIVMuls != Other.NumIVMuls)
742    return NumIVMuls < Other.NumIVMuls;
743  if (NumBaseAdds != Other.NumBaseAdds)
744    return NumBaseAdds < Other.NumBaseAdds;
745  if (ImmCost != Other.ImmCost)
746    return ImmCost < Other.ImmCost;
747  if (SetupCost != Other.SetupCost)
748    return SetupCost < Other.SetupCost;
749  return false;
750}
751
752void Cost::print(raw_ostream &OS) const {
753  OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
754  if (AddRecCost != 0)
755    OS << ", with addrec cost " << AddRecCost;
756  if (NumIVMuls != 0)
757    OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
758  if (NumBaseAdds != 0)
759    OS << ", plus " << NumBaseAdds << " base add"
760       << (NumBaseAdds == 1 ? "" : "s");
761  if (ImmCost != 0)
762    OS << ", plus " << ImmCost << " imm cost";
763  if (SetupCost != 0)
764    OS << ", plus " << SetupCost << " setup cost";
765}
766
767void Cost::dump() const {
768  print(errs()); errs() << '\n';
769}
770
771namespace {
772
773/// LSRFixup - An operand value in an instruction which is to be replaced
774/// with some equivalent, possibly strength-reduced, replacement.
775struct LSRFixup {
776  /// UserInst - The instruction which will be updated.
777  Instruction *UserInst;
778
779  /// OperandValToReplace - The operand of the instruction which will
780  /// be replaced. The operand may be used more than once; every instance
781  /// will be replaced.
782  Value *OperandValToReplace;
783
784  /// PostIncLoops - If this user is to use the post-incremented value of an
785  /// induction variable, this variable is non-null and holds the loop
786  /// associated with the induction variable.
787  PostIncLoopSet PostIncLoops;
788
789  /// LUIdx - The index of the LSRUse describing the expression which
790  /// this fixup needs, minus an offset (below).
791  size_t LUIdx;
792
793  /// Offset - A constant offset to be added to the LSRUse expression.
794  /// This allows multiple fixups to share the same LSRUse with different
795  /// offsets, for example in an unrolled loop.
796  int64_t Offset;
797
798  bool isUseFullyOutsideLoop(const Loop *L) const;
799
800  LSRFixup();
801
802  void print(raw_ostream &OS) const;
803  void dump() const;
804};
805
806}
807
808LSRFixup::LSRFixup()
809  : UserInst(0), OperandValToReplace(0),
810    LUIdx(~size_t(0)), Offset(0) {}
811
812/// isUseFullyOutsideLoop - Test whether this fixup always uses its
813/// value outside of the given loop.
814bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
815  // PHI nodes use their value in their incoming blocks.
816  if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
817    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
818      if (PN->getIncomingValue(i) == OperandValToReplace &&
819          L->contains(PN->getIncomingBlock(i)))
820        return false;
821    return true;
822  }
823
824  return !L->contains(UserInst);
825}
826
827void LSRFixup::print(raw_ostream &OS) const {
828  OS << "UserInst=";
829  // Store is common and interesting enough to be worth special-casing.
830  if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
831    OS << "store ";
832    WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false);
833  } else if (UserInst->getType()->isVoidTy())
834    OS << UserInst->getOpcodeName();
835  else
836    WriteAsOperand(OS, UserInst, /*PrintType=*/false);
837
838  OS << ", OperandValToReplace=";
839  WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false);
840
841  for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(),
842       E = PostIncLoops.end(); I != E; ++I) {
843    OS << ", PostIncLoop=";
844    WriteAsOperand(OS, (*I)->getHeader(), /*PrintType=*/false);
845  }
846
847  if (LUIdx != ~size_t(0))
848    OS << ", LUIdx=" << LUIdx;
849
850  if (Offset != 0)
851    OS << ", Offset=" << Offset;
852}
853
854void LSRFixup::dump() const {
855  print(errs()); errs() << '\n';
856}
857
858namespace {
859
860/// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding
861/// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*.
862struct UniquifierDenseMapInfo {
863  static SmallVector<const SCEV *, 2> getEmptyKey() {
864    SmallVector<const SCEV *, 2> V;
865    V.push_back(reinterpret_cast<const SCEV *>(-1));
866    return V;
867  }
868
869  static SmallVector<const SCEV *, 2> getTombstoneKey() {
870    SmallVector<const SCEV *, 2> V;
871    V.push_back(reinterpret_cast<const SCEV *>(-2));
872    return V;
873  }
874
875  static unsigned getHashValue(const SmallVector<const SCEV *, 2> &V) {
876    unsigned Result = 0;
877    for (SmallVectorImpl<const SCEV *>::const_iterator I = V.begin(),
878         E = V.end(); I != E; ++I)
879      Result ^= DenseMapInfo<const SCEV *>::getHashValue(*I);
880    return Result;
881  }
882
883  static bool isEqual(const SmallVector<const SCEV *, 2> &LHS,
884                      const SmallVector<const SCEV *, 2> &RHS) {
885    return LHS == RHS;
886  }
887};
888
889/// LSRUse - This class holds the state that LSR keeps for each use in
890/// IVUsers, as well as uses invented by LSR itself. It includes information
891/// about what kinds of things can be folded into the user, information about
892/// the user itself, and information about how the use may be satisfied.
893/// TODO: Represent multiple users of the same expression in common?
894class LSRUse {
895  DenseSet<SmallVector<const SCEV *, 2>, UniquifierDenseMapInfo> Uniquifier;
896
897public:
898  /// KindType - An enum for a kind of use, indicating what types of
899  /// scaled and immediate operands it might support.
900  enum KindType {
901    Basic,   ///< A normal use, with no folding.
902    Special, ///< A special case of basic, allowing -1 scales.
903    Address, ///< An address use; folding according to TargetLowering
904    ICmpZero ///< An equality icmp with both operands folded into one.
905    // TODO: Add a generic icmp too?
906  };
907
908  KindType Kind;
909  const Type *AccessTy;
910
911  SmallVector<int64_t, 8> Offsets;
912  int64_t MinOffset;
913  int64_t MaxOffset;
914
915  /// AllFixupsOutsideLoop - This records whether all of the fixups using this
916  /// LSRUse are outside of the loop, in which case some special-case heuristics
917  /// may be used.
918  bool AllFixupsOutsideLoop;
919
920  /// Formulae - A list of ways to build a value that can satisfy this user.
921  /// After the list is populated, one of these is selected heuristically and
922  /// used to formulate a replacement for OperandValToReplace in UserInst.
923  SmallVector<Formula, 12> Formulae;
924
925  /// Regs - The set of register candidates used by all formulae in this LSRUse.
926  SmallPtrSet<const SCEV *, 4> Regs;
927
928  LSRUse(KindType K, const Type *T) : Kind(K), AccessTy(T),
929                                      MinOffset(INT64_MAX),
930                                      MaxOffset(INT64_MIN),
931                                      AllFixupsOutsideLoop(true) {}
932
933  bool InsertFormula(const Formula &F);
934
935  void check() const;
936
937  void print(raw_ostream &OS) const;
938  void dump() const;
939};
940
941/// InsertFormula - If the given formula has not yet been inserted, add it to
942/// the list, and return true. Return false otherwise.
943bool LSRUse::InsertFormula(const Formula &F) {
944  SmallVector<const SCEV *, 2> Key = F.BaseRegs;
945  if (F.ScaledReg) Key.push_back(F.ScaledReg);
946  // Unstable sort by host order ok, because this is only used for uniquifying.
947  std::sort(Key.begin(), Key.end());
948
949  if (!Uniquifier.insert(Key).second)
950    return false;
951
952  // Using a register to hold the value of 0 is not profitable.
953  assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
954         "Zero allocated in a scaled register!");
955#ifndef NDEBUG
956  for (SmallVectorImpl<const SCEV *>::const_iterator I =
957       F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I)
958    assert(!(*I)->isZero() && "Zero allocated in a base register!");
959#endif
960
961  // Add the formula to the list.
962  Formulae.push_back(F);
963
964  // Record registers now being used by this use.
965  if (F.ScaledReg) Regs.insert(F.ScaledReg);
966  Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
967
968  return true;
969}
970
971void LSRUse::print(raw_ostream &OS) const {
972  OS << "LSR Use: Kind=";
973  switch (Kind) {
974  case Basic:    OS << "Basic"; break;
975  case Special:  OS << "Special"; break;
976  case ICmpZero: OS << "ICmpZero"; break;
977  case Address:
978    OS << "Address of ";
979    if (AccessTy->isPointerTy())
980      OS << "pointer"; // the full pointer type could be really verbose
981    else
982      OS << *AccessTy;
983  }
984
985  OS << ", Offsets={";
986  for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
987       E = Offsets.end(); I != E; ++I) {
988    OS << *I;
989    if (next(I) != E)
990      OS << ',';
991  }
992  OS << '}';
993
994  if (AllFixupsOutsideLoop)
995    OS << ", all-fixups-outside-loop";
996}
997
998void LSRUse::dump() const {
999  print(errs()); errs() << '\n';
1000}
1001
1002/// isLegalUse - Test whether the use described by AM is "legal", meaning it can
1003/// be completely folded into the user instruction at isel time. This includes
1004/// address-mode folding and special icmp tricks.
1005static bool isLegalUse(const TargetLowering::AddrMode &AM,
1006                       LSRUse::KindType Kind, const Type *AccessTy,
1007                       const TargetLowering *TLI) {
1008  switch (Kind) {
1009  case LSRUse::Address:
1010    // If we have low-level target information, ask the target if it can
1011    // completely fold this address.
1012    if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy);
1013
1014    // Otherwise, just guess that reg+reg addressing is legal.
1015    return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1;
1016
1017  case LSRUse::ICmpZero:
1018    // There's not even a target hook for querying whether it would be legal to
1019    // fold a GV into an ICmp.
1020    if (AM.BaseGV)
1021      return false;
1022
1023    // ICmp only has two operands; don't allow more than two non-trivial parts.
1024    if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0)
1025      return false;
1026
1027    // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1028    // putting the scaled register in the other operand of the icmp.
1029    if (AM.Scale != 0 && AM.Scale != -1)
1030      return false;
1031
1032    // If we have low-level target information, ask the target if it can fold an
1033    // integer immediate on an icmp.
1034    if (AM.BaseOffs != 0) {
1035      if (TLI) return TLI->isLegalICmpImmediate(-AM.BaseOffs);
1036      return false;
1037    }
1038
1039    return true;
1040
1041  case LSRUse::Basic:
1042    // Only handle single-register values.
1043    return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0;
1044
1045  case LSRUse::Special:
1046    // Only handle -1 scales, or no scale.
1047    return AM.Scale == 0 || AM.Scale == -1;
1048  }
1049
1050  return false;
1051}
1052
1053static bool isLegalUse(TargetLowering::AddrMode AM,
1054                       int64_t MinOffset, int64_t MaxOffset,
1055                       LSRUse::KindType Kind, const Type *AccessTy,
1056                       const TargetLowering *TLI) {
1057  // Check for overflow.
1058  if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) !=
1059      (MinOffset > 0))
1060    return false;
1061  AM.BaseOffs = (uint64_t)AM.BaseOffs + MinOffset;
1062  if (isLegalUse(AM, Kind, AccessTy, TLI)) {
1063    AM.BaseOffs = (uint64_t)AM.BaseOffs - MinOffset;
1064    // Check for overflow.
1065    if (((int64_t)((uint64_t)AM.BaseOffs + MaxOffset) > AM.BaseOffs) !=
1066        (MaxOffset > 0))
1067      return false;
1068    AM.BaseOffs = (uint64_t)AM.BaseOffs + MaxOffset;
1069    return isLegalUse(AM, Kind, AccessTy, TLI);
1070  }
1071  return false;
1072}
1073
1074static bool isAlwaysFoldable(int64_t BaseOffs,
1075                             GlobalValue *BaseGV,
1076                             bool HasBaseReg,
1077                             LSRUse::KindType Kind, const Type *AccessTy,
1078                             const TargetLowering *TLI) {
1079  // Fast-path: zero is always foldable.
1080  if (BaseOffs == 0 && !BaseGV) return true;
1081
1082  // Conservatively, create an address with an immediate and a
1083  // base and a scale.
1084  TargetLowering::AddrMode AM;
1085  AM.BaseOffs = BaseOffs;
1086  AM.BaseGV = BaseGV;
1087  AM.HasBaseReg = HasBaseReg;
1088  AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1089
1090  return isLegalUse(AM, Kind, AccessTy, TLI);
1091}
1092
1093static bool isAlwaysFoldable(const SCEV *S,
1094                             int64_t MinOffset, int64_t MaxOffset,
1095                             bool HasBaseReg,
1096                             LSRUse::KindType Kind, const Type *AccessTy,
1097                             const TargetLowering *TLI,
1098                             ScalarEvolution &SE) {
1099  // Fast-path: zero is always foldable.
1100  if (S->isZero()) return true;
1101
1102  // Conservatively, create an address with an immediate and a
1103  // base and a scale.
1104  int64_t BaseOffs = ExtractImmediate(S, SE);
1105  GlobalValue *BaseGV = ExtractSymbol(S, SE);
1106
1107  // If there's anything else involved, it's not foldable.
1108  if (!S->isZero()) return false;
1109
1110  // Fast-path: zero is always foldable.
1111  if (BaseOffs == 0 && !BaseGV) return true;
1112
1113  // Conservatively, create an address with an immediate and a
1114  // base and a scale.
1115  TargetLowering::AddrMode AM;
1116  AM.BaseOffs = BaseOffs;
1117  AM.BaseGV = BaseGV;
1118  AM.HasBaseReg = HasBaseReg;
1119  AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1120
1121  return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI);
1122}
1123
1124/// FormulaSorter - This class implements an ordering for formulae which sorts
1125/// the by their standalone cost.
1126class FormulaSorter {
1127  /// These two sets are kept empty, so that we compute standalone costs.
1128  DenseSet<const SCEV *> VisitedRegs;
1129  SmallPtrSet<const SCEV *, 16> Regs;
1130  Loop *L;
1131  LSRUse *LU;
1132  ScalarEvolution &SE;
1133  DominatorTree &DT;
1134
1135public:
1136  FormulaSorter(Loop *l, LSRUse &lu, ScalarEvolution &se, DominatorTree &dt)
1137    : L(l), LU(&lu), SE(se), DT(dt) {}
1138
1139  bool operator()(const Formula &A, const Formula &B) {
1140    Cost CostA;
1141    CostA.RateFormula(A, Regs, VisitedRegs, L, LU->Offsets, SE, DT);
1142    Regs.clear();
1143    Cost CostB;
1144    CostB.RateFormula(B, Regs, VisitedRegs, L, LU->Offsets, SE, DT);
1145    Regs.clear();
1146    return CostA < CostB;
1147  }
1148};
1149
1150/// LSRInstance - This class holds state for the main loop strength reduction
1151/// logic.
1152class LSRInstance {
1153  IVUsers &IU;
1154  ScalarEvolution &SE;
1155  DominatorTree &DT;
1156  const TargetLowering *const TLI;
1157  Loop *const L;
1158  bool Changed;
1159
1160  /// IVIncInsertPos - This is the insert position that the current loop's
1161  /// induction variable increment should be placed. In simple loops, this is
1162  /// the latch block's terminator. But in more complicated cases, this is a
1163  /// position which will dominate all the in-loop post-increment users.
1164  Instruction *IVIncInsertPos;
1165
1166  /// Factors - Interesting factors between use strides.
1167  SmallSetVector<int64_t, 8> Factors;
1168
1169  /// Types - Interesting use types, to facilitate truncation reuse.
1170  SmallSetVector<const Type *, 4> Types;
1171
1172  /// Fixups - The list of operands which are to be replaced.
1173  SmallVector<LSRFixup, 16> Fixups;
1174
1175  /// Uses - The list of interesting uses.
1176  SmallVector<LSRUse, 16> Uses;
1177
1178  /// RegUses - Track which uses use which register candidates.
1179  RegUseTracker RegUses;
1180
1181  void OptimizeShadowIV();
1182  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
1183  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
1184  bool OptimizeLoopTermCond();
1185
1186  void CollectInterestingTypesAndFactors();
1187  void CollectFixupsAndInitialFormulae();
1188
1189  LSRFixup &getNewFixup() {
1190    Fixups.push_back(LSRFixup());
1191    return Fixups.back();
1192  }
1193
1194  // Support for sharing of LSRUses between LSRFixups.
1195  typedef DenseMap<const SCEV *, size_t> UseMapTy;
1196  UseMapTy UseMap;
1197
1198  bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
1199                          LSRUse::KindType Kind, const Type *AccessTy);
1200
1201  std::pair<size_t, int64_t> getUse(const SCEV *&Expr,
1202                                    LSRUse::KindType Kind,
1203                                    const Type *AccessTy);
1204
1205public:
1206  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
1207  void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
1208  void CountRegisters(const Formula &F, size_t LUIdx);
1209  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
1210
1211  void CollectLoopInvariantFixupsAndFormulae();
1212
1213  void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
1214                              unsigned Depth = 0);
1215  void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
1216  void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
1217  void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
1218  void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
1219  void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
1220  void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
1221  void GenerateCrossUseConstantOffsets();
1222  void GenerateAllReuseFormulae();
1223
1224  void FilterOutUndesirableDedicatedRegisters();
1225  void NarrowSearchSpaceUsingHeuristics();
1226
1227  void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
1228                    Cost &SolutionCost,
1229                    SmallVectorImpl<const Formula *> &Workspace,
1230                    const Cost &CurCost,
1231                    const SmallPtrSet<const SCEV *, 16> &CurRegs,
1232                    DenseSet<const SCEV *> &VisitedRegs) const;
1233  void Solve(SmallVectorImpl<const Formula *> &Solution) const;
1234
1235  Value *Expand(const LSRFixup &LF,
1236                const Formula &F,
1237                BasicBlock::iterator IP,
1238                SCEVExpander &Rewriter,
1239                SmallVectorImpl<WeakVH> &DeadInsts) const;
1240  void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
1241                     const Formula &F,
1242                     SCEVExpander &Rewriter,
1243                     SmallVectorImpl<WeakVH> &DeadInsts,
1244                     Pass *P) const;
1245  void Rewrite(const LSRFixup &LF,
1246               const Formula &F,
1247               SCEVExpander &Rewriter,
1248               SmallVectorImpl<WeakVH> &DeadInsts,
1249               Pass *P) const;
1250  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
1251                         Pass *P);
1252
1253  LSRInstance(const TargetLowering *tli, Loop *l, Pass *P);
1254
1255  bool getChanged() const { return Changed; }
1256
1257  void print_factors_and_types(raw_ostream &OS) const;
1258  void print_fixups(raw_ostream &OS) const;
1259  void print_uses(raw_ostream &OS) const;
1260  void print(raw_ostream &OS) const;
1261  void dump() const;
1262};
1263
1264}
1265
1266/// OptimizeShadowIV - If IV is used in a int-to-float cast
1267/// inside the loop then try to eliminate the cast operation.
1268void LSRInstance::OptimizeShadowIV() {
1269  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1270  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1271    return;
1272
1273  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
1274       UI != E; /* empty */) {
1275    IVUsers::const_iterator CandidateUI = UI;
1276    ++UI;
1277    Instruction *ShadowUse = CandidateUI->getUser();
1278    const Type *DestTy = NULL;
1279
1280    /* If shadow use is a int->float cast then insert a second IV
1281       to eliminate this cast.
1282
1283         for (unsigned i = 0; i < n; ++i)
1284           foo((double)i);
1285
1286       is transformed into
1287
1288         double d = 0.0;
1289         for (unsigned i = 0; i < n; ++i, ++d)
1290           foo(d);
1291    */
1292    if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser()))
1293      DestTy = UCast->getDestTy();
1294    else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser()))
1295      DestTy = SCast->getDestTy();
1296    if (!DestTy) continue;
1297
1298    if (TLI) {
1299      // If target does not support DestTy natively then do not apply
1300      // this transformation.
1301      EVT DVT = TLI->getValueType(DestTy);
1302      if (!TLI->isTypeLegal(DVT)) continue;
1303    }
1304
1305    PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
1306    if (!PH) continue;
1307    if (PH->getNumIncomingValues() != 2) continue;
1308
1309    const Type *SrcTy = PH->getType();
1310    int Mantissa = DestTy->getFPMantissaWidth();
1311    if (Mantissa == -1) continue;
1312    if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
1313      continue;
1314
1315    unsigned Entry, Latch;
1316    if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
1317      Entry = 0;
1318      Latch = 1;
1319    } else {
1320      Entry = 1;
1321      Latch = 0;
1322    }
1323
1324    ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
1325    if (!Init) continue;
1326    Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue());
1327
1328    BinaryOperator *Incr =
1329      dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
1330    if (!Incr) continue;
1331    if (Incr->getOpcode() != Instruction::Add
1332        && Incr->getOpcode() != Instruction::Sub)
1333      continue;
1334
1335    /* Initialize new IV, double d = 0.0 in above example. */
1336    ConstantInt *C = NULL;
1337    if (Incr->getOperand(0) == PH)
1338      C = dyn_cast<ConstantInt>(Incr->getOperand(1));
1339    else if (Incr->getOperand(1) == PH)
1340      C = dyn_cast<ConstantInt>(Incr->getOperand(0));
1341    else
1342      continue;
1343
1344    if (!C) continue;
1345
1346    // Ignore negative constants, as the code below doesn't handle them
1347    // correctly. TODO: Remove this restriction.
1348    if (!C->getValue().isStrictlyPositive()) continue;
1349
1350    /* Add new PHINode. */
1351    PHINode *NewPH = PHINode::Create(DestTy, "IV.S.", PH);
1352
1353    /* create new increment. '++d' in above example. */
1354    Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
1355    BinaryOperator *NewIncr =
1356      BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
1357                               Instruction::FAdd : Instruction::FSub,
1358                             NewPH, CFP, "IV.S.next.", Incr);
1359
1360    NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
1361    NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
1362
1363    /* Remove cast operation */
1364    ShadowUse->replaceAllUsesWith(NewPH);
1365    ShadowUse->eraseFromParent();
1366    break;
1367  }
1368}
1369
1370/// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
1371/// set the IV user and stride information and return true, otherwise return
1372/// false.
1373bool LSRInstance::FindIVUserForCond(ICmpInst *Cond,
1374                                    IVStrideUse *&CondUse) {
1375  for (IVUsers::iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
1376    if (UI->getUser() == Cond) {
1377      // NOTE: we could handle setcc instructions with multiple uses here, but
1378      // InstCombine does it as well for simple uses, it's not clear that it
1379      // occurs enough in real life to handle.
1380      CondUse = UI;
1381      return true;
1382    }
1383  return false;
1384}
1385
1386/// OptimizeMax - Rewrite the loop's terminating condition if it uses
1387/// a max computation.
1388///
1389/// This is a narrow solution to a specific, but acute, problem. For loops
1390/// like this:
1391///
1392///   i = 0;
1393///   do {
1394///     p[i] = 0.0;
1395///   } while (++i < n);
1396///
1397/// the trip count isn't just 'n', because 'n' might not be positive. And
1398/// unfortunately this can come up even for loops where the user didn't use
1399/// a C do-while loop. For example, seemingly well-behaved top-test loops
1400/// will commonly be lowered like this:
1401//
1402///   if (n > 0) {
1403///     i = 0;
1404///     do {
1405///       p[i] = 0.0;
1406///     } while (++i < n);
1407///   }
1408///
1409/// and then it's possible for subsequent optimization to obscure the if
1410/// test in such a way that indvars can't find it.
1411///
1412/// When indvars can't find the if test in loops like this, it creates a
1413/// max expression, which allows it to give the loop a canonical
1414/// induction variable:
1415///
1416///   i = 0;
1417///   max = n < 1 ? 1 : n;
1418///   do {
1419///     p[i] = 0.0;
1420///   } while (++i != max);
1421///
1422/// Canonical induction variables are necessary because the loop passes
1423/// are designed around them. The most obvious example of this is the
1424/// LoopInfo analysis, which doesn't remember trip count values. It
1425/// expects to be able to rediscover the trip count each time it is
1426/// needed, and it does this using a simple analysis that only succeeds if
1427/// the loop has a canonical induction variable.
1428///
1429/// However, when it comes time to generate code, the maximum operation
1430/// can be quite costly, especially if it's inside of an outer loop.
1431///
1432/// This function solves this problem by detecting this type of loop and
1433/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
1434/// the instructions for the maximum computation.
1435///
1436ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
1437  // Check that the loop matches the pattern we're looking for.
1438  if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
1439      Cond->getPredicate() != CmpInst::ICMP_NE)
1440    return Cond;
1441
1442  SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
1443  if (!Sel || !Sel->hasOneUse()) return Cond;
1444
1445  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1446  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1447    return Cond;
1448  const SCEV *One = SE.getIntegerSCEV(1, BackedgeTakenCount->getType());
1449
1450  // Add one to the backedge-taken count to get the trip count.
1451  const SCEV *IterationCount = SE.getAddExpr(BackedgeTakenCount, One);
1452
1453  // Check for a max calculation that matches the pattern.
1454  if (!isa<SCEVSMaxExpr>(IterationCount) && !isa<SCEVUMaxExpr>(IterationCount))
1455    return Cond;
1456  const SCEVNAryExpr *Max = cast<SCEVNAryExpr>(IterationCount);
1457  if (Max != SE.getSCEV(Sel)) return Cond;
1458
1459  // To handle a max with more than two operands, this optimization would
1460  // require additional checking and setup.
1461  if (Max->getNumOperands() != 2)
1462    return Cond;
1463
1464  const SCEV *MaxLHS = Max->getOperand(0);
1465  const SCEV *MaxRHS = Max->getOperand(1);
1466  if (!MaxLHS || MaxLHS != One) return Cond;
1467  // Check the relevant induction variable for conformance to
1468  // the pattern.
1469  const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
1470  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
1471  if (!AR || !AR->isAffine() ||
1472      AR->getStart() != One ||
1473      AR->getStepRecurrence(SE) != One)
1474    return Cond;
1475
1476  assert(AR->getLoop() == L &&
1477         "Loop condition operand is an addrec in a different loop!");
1478
1479  // Check the right operand of the select, and remember it, as it will
1480  // be used in the new comparison instruction.
1481  Value *NewRHS = 0;
1482  if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
1483    NewRHS = Sel->getOperand(1);
1484  else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
1485    NewRHS = Sel->getOperand(2);
1486  if (!NewRHS) return Cond;
1487
1488  // Determine the new comparison opcode. It may be signed or unsigned,
1489  // and the original comparison may be either equality or inequality.
1490  CmpInst::Predicate Pred =
1491    isa<SCEVSMaxExpr>(Max) ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
1492  if (Cond->getPredicate() == CmpInst::ICMP_EQ)
1493    Pred = CmpInst::getInversePredicate(Pred);
1494
1495  // Ok, everything looks ok to change the condition into an SLT or SGE and
1496  // delete the max calculation.
1497  ICmpInst *NewCond =
1498    new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
1499
1500  // Delete the max calculation instructions.
1501  Cond->replaceAllUsesWith(NewCond);
1502  CondUse->setUser(NewCond);
1503  Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
1504  Cond->eraseFromParent();
1505  Sel->eraseFromParent();
1506  if (Cmp->use_empty())
1507    Cmp->eraseFromParent();
1508  return NewCond;
1509}
1510
1511/// OptimizeLoopTermCond - Change loop terminating condition to use the
1512/// postinc iv when possible.
1513bool
1514LSRInstance::OptimizeLoopTermCond() {
1515  SmallPtrSet<Instruction *, 4> PostIncs;
1516
1517  BasicBlock *LatchBlock = L->getLoopLatch();
1518  SmallVector<BasicBlock*, 8> ExitingBlocks;
1519  L->getExitingBlocks(ExitingBlocks);
1520
1521  for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
1522    BasicBlock *ExitingBlock = ExitingBlocks[i];
1523
1524    // Get the terminating condition for the loop if possible.  If we
1525    // can, we want to change it to use a post-incremented version of its
1526    // induction variable, to allow coalescing the live ranges for the IV into
1527    // one register value.
1528
1529    BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
1530    if (!TermBr)
1531      continue;
1532    // FIXME: Overly conservative, termination condition could be an 'or' etc..
1533    if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
1534      continue;
1535
1536    // Search IVUsesByStride to find Cond's IVUse if there is one.
1537    IVStrideUse *CondUse = 0;
1538    ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
1539    if (!FindIVUserForCond(Cond, CondUse))
1540      continue;
1541
1542    // If the trip count is computed in terms of a max (due to ScalarEvolution
1543    // being unable to find a sufficient guard, for example), change the loop
1544    // comparison to use SLT or ULT instead of NE.
1545    // One consequence of doing this now is that it disrupts the count-down
1546    // optimization. That's not always a bad thing though, because in such
1547    // cases it may still be worthwhile to avoid a max.
1548    Cond = OptimizeMax(Cond, CondUse);
1549
1550    // If this exiting block dominates the latch block, it may also use
1551    // the post-inc value if it won't be shared with other uses.
1552    // Check for dominance.
1553    if (!DT.dominates(ExitingBlock, LatchBlock))
1554      continue;
1555
1556    // Conservatively avoid trying to use the post-inc value in non-latch
1557    // exits if there may be pre-inc users in intervening blocks.
1558    if (LatchBlock != ExitingBlock)
1559      for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
1560        // Test if the use is reachable from the exiting block. This dominator
1561        // query is a conservative approximation of reachability.
1562        if (&*UI != CondUse &&
1563            !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
1564          // Conservatively assume there may be reuse if the quotient of their
1565          // strides could be a legal scale.
1566          const SCEV *A = CondUse->getStride(L);
1567          const SCEV *B = UI->getStride(L);
1568          if (!A || !B) continue;
1569          if (SE.getTypeSizeInBits(A->getType()) !=
1570              SE.getTypeSizeInBits(B->getType())) {
1571            if (SE.getTypeSizeInBits(A->getType()) >
1572                SE.getTypeSizeInBits(B->getType()))
1573              B = SE.getSignExtendExpr(B, A->getType());
1574            else
1575              A = SE.getSignExtendExpr(A, B->getType());
1576          }
1577          if (const SCEVConstant *D =
1578                dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
1579            // Stride of one or negative one can have reuse with non-addresses.
1580            if (D->getValue()->isOne() ||
1581                D->getValue()->isAllOnesValue())
1582              goto decline_post_inc;
1583            // Avoid weird situations.
1584            if (D->getValue()->getValue().getMinSignedBits() >= 64 ||
1585                D->getValue()->getValue().isMinSignedValue())
1586              goto decline_post_inc;
1587            // Without TLI, assume that any stride might be valid, and so any
1588            // use might be shared.
1589            if (!TLI)
1590              goto decline_post_inc;
1591            // Check for possible scaled-address reuse.
1592            const Type *AccessTy = getAccessType(UI->getUser());
1593            TargetLowering::AddrMode AM;
1594            AM.Scale = D->getValue()->getSExtValue();
1595            if (TLI->isLegalAddressingMode(AM, AccessTy))
1596              goto decline_post_inc;
1597            AM.Scale = -AM.Scale;
1598            if (TLI->isLegalAddressingMode(AM, AccessTy))
1599              goto decline_post_inc;
1600          }
1601        }
1602
1603    DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
1604                 << *Cond << '\n');
1605
1606    // It's possible for the setcc instruction to be anywhere in the loop, and
1607    // possible for it to have multiple users.  If it is not immediately before
1608    // the exiting block branch, move it.
1609    if (&*++BasicBlock::iterator(Cond) != TermBr) {
1610      if (Cond->hasOneUse()) {
1611        Cond->moveBefore(TermBr);
1612      } else {
1613        // Clone the terminating condition and insert into the loopend.
1614        ICmpInst *OldCond = Cond;
1615        Cond = cast<ICmpInst>(Cond->clone());
1616        Cond->setName(L->getHeader()->getName() + ".termcond");
1617        ExitingBlock->getInstList().insert(TermBr, Cond);
1618
1619        // Clone the IVUse, as the old use still exists!
1620        CondUse = &IU.AddUser(CondUse->getExpr(),
1621                              Cond, CondUse->getOperandValToReplace());
1622        TermBr->replaceUsesOfWith(OldCond, Cond);
1623      }
1624    }
1625
1626    // If we get to here, we know that we can transform the setcc instruction to
1627    // use the post-incremented version of the IV, allowing us to coalesce the
1628    // live ranges for the IV correctly.
1629    CondUse->transformToPostInc(L);
1630    Changed = true;
1631
1632    PostIncs.insert(Cond);
1633  decline_post_inc:;
1634  }
1635
1636  // Determine an insertion point for the loop induction variable increment. It
1637  // must dominate all the post-inc comparisons we just set up, and it must
1638  // dominate the loop latch edge.
1639  IVIncInsertPos = L->getLoopLatch()->getTerminator();
1640  for (SmallPtrSet<Instruction *, 4>::const_iterator I = PostIncs.begin(),
1641       E = PostIncs.end(); I != E; ++I) {
1642    BasicBlock *BB =
1643      DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
1644                                    (*I)->getParent());
1645    if (BB == (*I)->getParent())
1646      IVIncInsertPos = *I;
1647    else if (BB != IVIncInsertPos->getParent())
1648      IVIncInsertPos = BB->getTerminator();
1649  }
1650
1651  return Changed;
1652}
1653
1654bool
1655LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
1656                                LSRUse::KindType Kind, const Type *AccessTy) {
1657  int64_t NewMinOffset = LU.MinOffset;
1658  int64_t NewMaxOffset = LU.MaxOffset;
1659  const Type *NewAccessTy = AccessTy;
1660
1661  // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
1662  // something conservative, however this can pessimize in the case that one of
1663  // the uses will have all its uses outside the loop, for example.
1664  if (LU.Kind != Kind)
1665    return false;
1666  // Conservatively assume HasBaseReg is true for now.
1667  if (NewOffset < LU.MinOffset) {
1668    if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, /*HasBaseReg=*/true,
1669                          Kind, AccessTy, TLI))
1670      return false;
1671    NewMinOffset = NewOffset;
1672  } else if (NewOffset > LU.MaxOffset) {
1673    if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, /*HasBaseReg=*/true,
1674                          Kind, AccessTy, TLI))
1675      return false;
1676    NewMaxOffset = NewOffset;
1677  }
1678  // Check for a mismatched access type, and fall back conservatively as needed.
1679  if (Kind == LSRUse::Address && AccessTy != LU.AccessTy)
1680    NewAccessTy = Type::getVoidTy(AccessTy->getContext());
1681
1682  // Update the use.
1683  LU.MinOffset = NewMinOffset;
1684  LU.MaxOffset = NewMaxOffset;
1685  LU.AccessTy = NewAccessTy;
1686  if (NewOffset != LU.Offsets.back())
1687    LU.Offsets.push_back(NewOffset);
1688  return true;
1689}
1690
1691/// getUse - Return an LSRUse index and an offset value for a fixup which
1692/// needs the given expression, with the given kind and optional access type.
1693/// Either reuse an existing use or create a new one, as needed.
1694std::pair<size_t, int64_t>
1695LSRInstance::getUse(const SCEV *&Expr,
1696                    LSRUse::KindType Kind, const Type *AccessTy) {
1697  const SCEV *Copy = Expr;
1698  int64_t Offset = ExtractImmediate(Expr, SE);
1699
1700  // Basic uses can't accept any offset, for example.
1701  if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) {
1702    Expr = Copy;
1703    Offset = 0;
1704  }
1705
1706  std::pair<UseMapTy::iterator, bool> P =
1707    UseMap.insert(std::make_pair(Expr, 0));
1708  if (!P.second) {
1709    // A use already existed with this base.
1710    size_t LUIdx = P.first->second;
1711    LSRUse &LU = Uses[LUIdx];
1712    if (reconcileNewOffset(LU, Offset, Kind, AccessTy))
1713      // Reuse this use.
1714      return std::make_pair(LUIdx, Offset);
1715  }
1716
1717  // Create a new use.
1718  size_t LUIdx = Uses.size();
1719  P.first->second = LUIdx;
1720  Uses.push_back(LSRUse(Kind, AccessTy));
1721  LSRUse &LU = Uses[LUIdx];
1722
1723  // We don't need to track redundant offsets, but we don't need to go out
1724  // of our way here to avoid them.
1725  if (LU.Offsets.empty() || Offset != LU.Offsets.back())
1726    LU.Offsets.push_back(Offset);
1727
1728  LU.MinOffset = Offset;
1729  LU.MaxOffset = Offset;
1730  return std::make_pair(LUIdx, Offset);
1731}
1732
1733void LSRInstance::CollectInterestingTypesAndFactors() {
1734  SmallSetVector<const SCEV *, 4> Strides;
1735
1736  // Collect interesting types and strides.
1737  SmallVector<const SCEV *, 4> Worklist;
1738  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
1739    const SCEV *Expr = UI->getExpr();
1740
1741    // Collect interesting types.
1742    Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
1743
1744    // Add strides for mentioned loops.
1745    Worklist.push_back(Expr);
1746    do {
1747      const SCEV *S = Worklist.pop_back_val();
1748      if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1749        Strides.insert(AR->getStepRecurrence(SE));
1750        Worklist.push_back(AR->getStart());
1751      } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1752        Worklist.insert(Worklist.end(), Add->op_begin(), Add->op_end());
1753      }
1754    } while (!Worklist.empty());
1755  }
1756
1757  // Compute interesting factors from the set of interesting strides.
1758  for (SmallSetVector<const SCEV *, 4>::const_iterator
1759       I = Strides.begin(), E = Strides.end(); I != E; ++I)
1760    for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
1761         next(I); NewStrideIter != E; ++NewStrideIter) {
1762      const SCEV *OldStride = *I;
1763      const SCEV *NewStride = *NewStrideIter;
1764
1765      if (SE.getTypeSizeInBits(OldStride->getType()) !=
1766          SE.getTypeSizeInBits(NewStride->getType())) {
1767        if (SE.getTypeSizeInBits(OldStride->getType()) >
1768            SE.getTypeSizeInBits(NewStride->getType()))
1769          NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
1770        else
1771          OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
1772      }
1773      if (const SCEVConstant *Factor =
1774            dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
1775                                                        SE, true))) {
1776        if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
1777          Factors.insert(Factor->getValue()->getValue().getSExtValue());
1778      } else if (const SCEVConstant *Factor =
1779                   dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
1780                                                               NewStride,
1781                                                               SE, true))) {
1782        if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
1783          Factors.insert(Factor->getValue()->getValue().getSExtValue());
1784      }
1785    }
1786
1787  // If all uses use the same type, don't bother looking for truncation-based
1788  // reuse.
1789  if (Types.size() == 1)
1790    Types.clear();
1791
1792  DEBUG(print_factors_and_types(dbgs()));
1793}
1794
1795void LSRInstance::CollectFixupsAndInitialFormulae() {
1796  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
1797    // Record the uses.
1798    LSRFixup &LF = getNewFixup();
1799    LF.UserInst = UI->getUser();
1800    LF.OperandValToReplace = UI->getOperandValToReplace();
1801    LF.PostIncLoops = UI->getPostIncLoops();
1802
1803    LSRUse::KindType Kind = LSRUse::Basic;
1804    const Type *AccessTy = 0;
1805    if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
1806      Kind = LSRUse::Address;
1807      AccessTy = getAccessType(LF.UserInst);
1808    }
1809
1810    const SCEV *S = UI->getExpr();
1811
1812    // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
1813    // (N - i == 0), and this allows (N - i) to be the expression that we work
1814    // with rather than just N or i, so we can consider the register
1815    // requirements for both N and i at the same time. Limiting this code to
1816    // equality icmps is not a problem because all interesting loops use
1817    // equality icmps, thanks to IndVarSimplify.
1818    if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst))
1819      if (CI->isEquality()) {
1820        // Swap the operands if needed to put the OperandValToReplace on the
1821        // left, for consistency.
1822        Value *NV = CI->getOperand(1);
1823        if (NV == LF.OperandValToReplace) {
1824          CI->setOperand(1, CI->getOperand(0));
1825          CI->setOperand(0, NV);
1826        }
1827
1828        // x == y  -->  x - y == 0
1829        const SCEV *N = SE.getSCEV(NV);
1830        if (N->isLoopInvariant(L)) {
1831          Kind = LSRUse::ICmpZero;
1832          S = SE.getMinusSCEV(N, S);
1833        }
1834
1835        // -1 and the negations of all interesting strides (except the negation
1836        // of -1) are now also interesting.
1837        for (size_t i = 0, e = Factors.size(); i != e; ++i)
1838          if (Factors[i] != -1)
1839            Factors.insert(-(uint64_t)Factors[i]);
1840        Factors.insert(-1);
1841      }
1842
1843    // Set up the initial formula for this use.
1844    std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
1845    LF.LUIdx = P.first;
1846    LF.Offset = P.second;
1847    LSRUse &LU = Uses[LF.LUIdx];
1848    LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
1849
1850    // If this is the first use of this LSRUse, give it a formula.
1851    if (LU.Formulae.empty()) {
1852      InsertInitialFormula(S, LU, LF.LUIdx);
1853      CountRegisters(LU.Formulae.back(), LF.LUIdx);
1854    }
1855  }
1856
1857  DEBUG(print_fixups(dbgs()));
1858}
1859
1860void
1861LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
1862  Formula F;
1863  F.InitialMatch(S, L, SE, DT);
1864  bool Inserted = InsertFormula(LU, LUIdx, F);
1865  assert(Inserted && "Initial formula already exists!"); (void)Inserted;
1866}
1867
1868void
1869LSRInstance::InsertSupplementalFormula(const SCEV *S,
1870                                       LSRUse &LU, size_t LUIdx) {
1871  Formula F;
1872  F.BaseRegs.push_back(S);
1873  F.AM.HasBaseReg = true;
1874  bool Inserted = InsertFormula(LU, LUIdx, F);
1875  assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
1876}
1877
1878/// CountRegisters - Note which registers are used by the given formula,
1879/// updating RegUses.
1880void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
1881  if (F.ScaledReg)
1882    RegUses.CountRegister(F.ScaledReg, LUIdx);
1883  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
1884       E = F.BaseRegs.end(); I != E; ++I)
1885    RegUses.CountRegister(*I, LUIdx);
1886}
1887
1888/// InsertFormula - If the given formula has not yet been inserted, add it to
1889/// the list, and return true. Return false otherwise.
1890bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
1891  if (!LU.InsertFormula(F))
1892    return false;
1893
1894  CountRegisters(F, LUIdx);
1895  return true;
1896}
1897
1898/// CollectLoopInvariantFixupsAndFormulae - Check for other uses of
1899/// loop-invariant values which we're tracking. These other uses will pin these
1900/// values in registers, making them less profitable for elimination.
1901/// TODO: This currently misses non-constant addrec step registers.
1902/// TODO: Should this give more weight to users inside the loop?
1903void
1904LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
1905  SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
1906  SmallPtrSet<const SCEV *, 8> Inserted;
1907
1908  while (!Worklist.empty()) {
1909    const SCEV *S = Worklist.pop_back_val();
1910
1911    if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
1912      Worklist.insert(Worklist.end(), N->op_begin(), N->op_end());
1913    else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
1914      Worklist.push_back(C->getOperand());
1915    else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
1916      Worklist.push_back(D->getLHS());
1917      Worklist.push_back(D->getRHS());
1918    } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
1919      if (!Inserted.insert(U)) continue;
1920      const Value *V = U->getValue();
1921      if (const Instruction *Inst = dyn_cast<Instruction>(V))
1922        if (L->contains(Inst)) continue;
1923      for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
1924           UI != UE; ++UI) {
1925        const Instruction *UserInst = dyn_cast<Instruction>(*UI);
1926        // Ignore non-instructions.
1927        if (!UserInst)
1928          continue;
1929        // Ignore instructions in other functions (as can happen with
1930        // Constants).
1931        if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
1932          continue;
1933        // Ignore instructions not dominated by the loop.
1934        const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
1935          UserInst->getParent() :
1936          cast<PHINode>(UserInst)->getIncomingBlock(
1937            PHINode::getIncomingValueNumForOperand(UI.getOperandNo()));
1938        if (!DT.dominates(L->getHeader(), UseBB))
1939          continue;
1940        // Ignore uses which are part of other SCEV expressions, to avoid
1941        // analyzing them multiple times.
1942        if (SE.isSCEVable(UserInst->getType()) &&
1943            !isa<SCEVUnknown>(SE.getSCEV(const_cast<Instruction *>(UserInst))))
1944          continue;
1945        // Ignore icmp instructions which are already being analyzed.
1946        if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
1947          unsigned OtherIdx = !UI.getOperandNo();
1948          Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
1949          if (SE.getSCEV(OtherOp)->hasComputableLoopEvolution(L))
1950            continue;
1951        }
1952
1953        LSRFixup &LF = getNewFixup();
1954        LF.UserInst = const_cast<Instruction *>(UserInst);
1955        LF.OperandValToReplace = UI.getUse();
1956        std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, 0);
1957        LF.LUIdx = P.first;
1958        LF.Offset = P.second;
1959        LSRUse &LU = Uses[LF.LUIdx];
1960        LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
1961        InsertSupplementalFormula(U, LU, LF.LUIdx);
1962        CountRegisters(LU.Formulae.back(), Uses.size() - 1);
1963        break;
1964      }
1965    }
1966  }
1967}
1968
1969/// CollectSubexprs - Split S into subexpressions which can be pulled out into
1970/// separate registers. If C is non-null, multiply each subexpression by C.
1971static void CollectSubexprs(const SCEV *S, const SCEVConstant *C,
1972                            SmallVectorImpl<const SCEV *> &Ops,
1973                            ScalarEvolution &SE) {
1974  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1975    // Break out add operands.
1976    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
1977         I != E; ++I)
1978      CollectSubexprs(*I, C, Ops, SE);
1979    return;
1980  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1981    // Split a non-zero base out of an addrec.
1982    if (!AR->getStart()->isZero()) {
1983      CollectSubexprs(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()),
1984                                       AR->getStepRecurrence(SE),
1985                                       AR->getLoop()), C, Ops, SE);
1986      CollectSubexprs(AR->getStart(), C, Ops, SE);
1987      return;
1988    }
1989  } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
1990    // Break (C * (a + b + c)) into C*a + C*b + C*c.
1991    if (Mul->getNumOperands() == 2)
1992      if (const SCEVConstant *Op0 =
1993            dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
1994        CollectSubexprs(Mul->getOperand(1),
1995                        C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0,
1996                        Ops, SE);
1997        return;
1998      }
1999  }
2000
2001  // Otherwise use the value itself.
2002  Ops.push_back(C ? SE.getMulExpr(C, S) : S);
2003}
2004
2005/// GenerateReassociations - Split out subexpressions from adds and the bases of
2006/// addrecs.
2007void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
2008                                         Formula Base,
2009                                         unsigned Depth) {
2010  // Arbitrarily cap recursion to protect compile time.
2011  if (Depth >= 3) return;
2012
2013  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
2014    const SCEV *BaseReg = Base.BaseRegs[i];
2015
2016    SmallVector<const SCEV *, 8> AddOps;
2017    CollectSubexprs(BaseReg, 0, AddOps, SE);
2018    if (AddOps.size() == 1) continue;
2019
2020    for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
2021         JE = AddOps.end(); J != JE; ++J) {
2022      // Don't pull a constant into a register if the constant could be folded
2023      // into an immediate field.
2024      if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset,
2025                           Base.getNumRegs() > 1,
2026                           LU.Kind, LU.AccessTy, TLI, SE))
2027        continue;
2028
2029      // Collect all operands except *J.
2030      SmallVector<const SCEV *, 8> InnerAddOps;
2031      for (SmallVectorImpl<const SCEV *>::const_iterator K = AddOps.begin(),
2032           KE = AddOps.end(); K != KE; ++K)
2033        if (K != J)
2034          InnerAddOps.push_back(*K);
2035
2036      // Don't leave just a constant behind in a register if the constant could
2037      // be folded into an immediate field.
2038      if (InnerAddOps.size() == 1 &&
2039          isAlwaysFoldable(InnerAddOps[0], LU.MinOffset, LU.MaxOffset,
2040                           Base.getNumRegs() > 1,
2041                           LU.Kind, LU.AccessTy, TLI, SE))
2042        continue;
2043
2044      Formula F = Base;
2045      F.BaseRegs[i] = SE.getAddExpr(InnerAddOps);
2046      F.BaseRegs.push_back(*J);
2047      if (InsertFormula(LU, LUIdx, F))
2048        // If that formula hadn't been seen before, recurse to find more like
2049        // it.
2050        GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1);
2051    }
2052  }
2053}
2054
2055/// GenerateCombinations - Generate a formula consisting of all of the
2056/// loop-dominating registers added into a single register.
2057void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
2058                                       Formula Base) {
2059  // This method is only interesting on a plurality of registers.
2060  if (Base.BaseRegs.size() <= 1) return;
2061
2062  Formula F = Base;
2063  F.BaseRegs.clear();
2064  SmallVector<const SCEV *, 4> Ops;
2065  for (SmallVectorImpl<const SCEV *>::const_iterator
2066       I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) {
2067    const SCEV *BaseReg = *I;
2068    if (BaseReg->properlyDominates(L->getHeader(), &DT) &&
2069        !BaseReg->hasComputableLoopEvolution(L))
2070      Ops.push_back(BaseReg);
2071    else
2072      F.BaseRegs.push_back(BaseReg);
2073  }
2074  if (Ops.size() > 1) {
2075    const SCEV *Sum = SE.getAddExpr(Ops);
2076    // TODO: If Sum is zero, it probably means ScalarEvolution missed an
2077    // opportunity to fold something. For now, just ignore such cases
2078    // rather than proceed with zero in a register.
2079    if (!Sum->isZero()) {
2080      F.BaseRegs.push_back(Sum);
2081      (void)InsertFormula(LU, LUIdx, F);
2082    }
2083  }
2084}
2085
2086/// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets.
2087void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
2088                                          Formula Base) {
2089  // We can't add a symbolic offset if the address already contains one.
2090  if (Base.AM.BaseGV) return;
2091
2092  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
2093    const SCEV *G = Base.BaseRegs[i];
2094    GlobalValue *GV = ExtractSymbol(G, SE);
2095    if (G->isZero() || !GV)
2096      continue;
2097    Formula F = Base;
2098    F.AM.BaseGV = GV;
2099    if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
2100                    LU.Kind, LU.AccessTy, TLI))
2101      continue;
2102    F.BaseRegs[i] = G;
2103    (void)InsertFormula(LU, LUIdx, F);
2104  }
2105}
2106
2107/// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets.
2108void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
2109                                          Formula Base) {
2110  // TODO: For now, just add the min and max offset, because it usually isn't
2111  // worthwhile looking at everything inbetween.
2112  SmallVector<int64_t, 4> Worklist;
2113  Worklist.push_back(LU.MinOffset);
2114  if (LU.MaxOffset != LU.MinOffset)
2115    Worklist.push_back(LU.MaxOffset);
2116
2117  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
2118    const SCEV *G = Base.BaseRegs[i];
2119
2120    for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(),
2121         E = Worklist.end(); I != E; ++I) {
2122      Formula F = Base;
2123      F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I;
2124      if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I,
2125                     LU.Kind, LU.AccessTy, TLI)) {
2126        F.BaseRegs[i] = SE.getAddExpr(G, SE.getIntegerSCEV(*I, G->getType()));
2127
2128        (void)InsertFormula(LU, LUIdx, F);
2129      }
2130    }
2131
2132    int64_t Imm = ExtractImmediate(G, SE);
2133    if (G->isZero() || Imm == 0)
2134      continue;
2135    Formula F = Base;
2136    F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Imm;
2137    if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
2138                    LU.Kind, LU.AccessTy, TLI))
2139      continue;
2140    F.BaseRegs[i] = G;
2141    (void)InsertFormula(LU, LUIdx, F);
2142  }
2143}
2144
2145/// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up
2146/// the comparison. For example, x == y -> x*c == y*c.
2147void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
2148                                         Formula Base) {
2149  if (LU.Kind != LSRUse::ICmpZero) return;
2150
2151  // Determine the integer type for the base formula.
2152  const Type *IntTy = Base.getType();
2153  if (!IntTy) return;
2154  if (SE.getTypeSizeInBits(IntTy) > 64) return;
2155
2156  // Don't do this if there is more than one offset.
2157  if (LU.MinOffset != LU.MaxOffset) return;
2158
2159  assert(!Base.AM.BaseGV && "ICmpZero use is not legal!");
2160
2161  // Check each interesting stride.
2162  for (SmallSetVector<int64_t, 8>::const_iterator
2163       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
2164    int64_t Factor = *I;
2165    Formula F = Base;
2166
2167    // Check that the multiplication doesn't overflow.
2168    if (F.AM.BaseOffs == INT64_MIN && Factor == -1)
2169      continue;
2170    F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs * Factor;
2171    if (F.AM.BaseOffs / Factor != Base.AM.BaseOffs)
2172      continue;
2173
2174    // Check that multiplying with the use offset doesn't overflow.
2175    int64_t Offset = LU.MinOffset;
2176    if (Offset == INT64_MIN && Factor == -1)
2177      continue;
2178    Offset = (uint64_t)Offset * Factor;
2179    if (Offset / Factor != LU.MinOffset)
2180      continue;
2181
2182    // Check that this scale is legal.
2183    if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI))
2184      continue;
2185
2186    // Compensate for the use having MinOffset built into it.
2187    F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Offset - LU.MinOffset;
2188
2189    const SCEV *FactorS = SE.getIntegerSCEV(Factor, IntTy);
2190
2191    // Check that multiplying with each base register doesn't overflow.
2192    for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
2193      F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
2194      if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
2195        goto next;
2196    }
2197
2198    // Check that multiplying with the scaled register doesn't overflow.
2199    if (F.ScaledReg) {
2200      F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
2201      if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
2202        continue;
2203    }
2204
2205    // If we make it here and it's legal, add it.
2206    (void)InsertFormula(LU, LUIdx, F);
2207  next:;
2208  }
2209}
2210
2211/// GenerateScales - Generate stride factor reuse formulae by making use of
2212/// scaled-offset address modes, for example.
2213void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx,
2214                                 Formula Base) {
2215  // Determine the integer type for the base formula.
2216  const Type *IntTy = Base.getType();
2217  if (!IntTy) return;
2218
2219  // If this Formula already has a scaled register, we can't add another one.
2220  if (Base.AM.Scale != 0) return;
2221
2222  // Check each interesting stride.
2223  for (SmallSetVector<int64_t, 8>::const_iterator
2224       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
2225    int64_t Factor = *I;
2226
2227    Base.AM.Scale = Factor;
2228    Base.AM.HasBaseReg = Base.BaseRegs.size() > 1;
2229    // Check whether this scale is going to be legal.
2230    if (!isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
2231                    LU.Kind, LU.AccessTy, TLI)) {
2232      // As a special-case, handle special out-of-loop Basic users specially.
2233      // TODO: Reconsider this special case.
2234      if (LU.Kind == LSRUse::Basic &&
2235          isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
2236                     LSRUse::Special, LU.AccessTy, TLI) &&
2237          LU.AllFixupsOutsideLoop)
2238        LU.Kind = LSRUse::Special;
2239      else
2240        continue;
2241    }
2242    // For an ICmpZero, negating a solitary base register won't lead to
2243    // new solutions.
2244    if (LU.Kind == LSRUse::ICmpZero &&
2245        !Base.AM.HasBaseReg && Base.AM.BaseOffs == 0 && !Base.AM.BaseGV)
2246      continue;
2247    // For each addrec base reg, apply the scale, if possible.
2248    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
2249      if (const SCEVAddRecExpr *AR =
2250            dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
2251        const SCEV *FactorS = SE.getIntegerSCEV(Factor, IntTy);
2252        if (FactorS->isZero())
2253          continue;
2254        // Divide out the factor, ignoring high bits, since we'll be
2255        // scaling the value back up in the end.
2256        if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
2257          // TODO: This could be optimized to avoid all the copying.
2258          Formula F = Base;
2259          F.ScaledReg = Quotient;
2260          std::swap(F.BaseRegs[i], F.BaseRegs.back());
2261          F.BaseRegs.pop_back();
2262          (void)InsertFormula(LU, LUIdx, F);
2263        }
2264      }
2265  }
2266}
2267
2268/// GenerateTruncates - Generate reuse formulae from different IV types.
2269void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx,
2270                                    Formula Base) {
2271  // This requires TargetLowering to tell us which truncates are free.
2272  if (!TLI) return;
2273
2274  // Don't bother truncating symbolic values.
2275  if (Base.AM.BaseGV) return;
2276
2277  // Determine the integer type for the base formula.
2278  const Type *DstTy = Base.getType();
2279  if (!DstTy) return;
2280  DstTy = SE.getEffectiveSCEVType(DstTy);
2281
2282  for (SmallSetVector<const Type *, 4>::const_iterator
2283       I = Types.begin(), E = Types.end(); I != E; ++I) {
2284    const Type *SrcTy = *I;
2285    if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) {
2286      Formula F = Base;
2287
2288      if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
2289      for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(),
2290           JE = F.BaseRegs.end(); J != JE; ++J)
2291        *J = SE.getAnyExtendExpr(*J, SrcTy);
2292
2293      // TODO: This assumes we've done basic processing on all uses and
2294      // have an idea what the register usage is.
2295      if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
2296        continue;
2297
2298      (void)InsertFormula(LU, LUIdx, F);
2299    }
2300  }
2301}
2302
2303namespace {
2304
2305/// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to
2306/// defer modifications so that the search phase doesn't have to worry about
2307/// the data structures moving underneath it.
2308struct WorkItem {
2309  size_t LUIdx;
2310  int64_t Imm;
2311  const SCEV *OrigReg;
2312
2313  WorkItem(size_t LI, int64_t I, const SCEV *R)
2314    : LUIdx(LI), Imm(I), OrigReg(R) {}
2315
2316  void print(raw_ostream &OS) const;
2317  void dump() const;
2318};
2319
2320}
2321
2322void WorkItem::print(raw_ostream &OS) const {
2323  OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
2324     << " , add offset " << Imm;
2325}
2326
2327void WorkItem::dump() const {
2328  print(errs()); errs() << '\n';
2329}
2330
2331/// GenerateCrossUseConstantOffsets - Look for registers which are a constant
2332/// distance apart and try to form reuse opportunities between them.
2333void LSRInstance::GenerateCrossUseConstantOffsets() {
2334  // Group the registers by their value without any added constant offset.
2335  typedef std::map<int64_t, const SCEV *> ImmMapTy;
2336  typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy;
2337  RegMapTy Map;
2338  DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
2339  SmallVector<const SCEV *, 8> Sequence;
2340  for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
2341       I != E; ++I) {
2342    const SCEV *Reg = *I;
2343    int64_t Imm = ExtractImmediate(Reg, SE);
2344    std::pair<RegMapTy::iterator, bool> Pair =
2345      Map.insert(std::make_pair(Reg, ImmMapTy()));
2346    if (Pair.second)
2347      Sequence.push_back(Reg);
2348    Pair.first->second.insert(std::make_pair(Imm, *I));
2349    UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I);
2350  }
2351
2352  // Now examine each set of registers with the same base value. Build up
2353  // a list of work to do and do the work in a separate step so that we're
2354  // not adding formulae and register counts while we're searching.
2355  SmallVector<WorkItem, 32> WorkItems;
2356  SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
2357  for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
2358       E = Sequence.end(); I != E; ++I) {
2359    const SCEV *Reg = *I;
2360    const ImmMapTy &Imms = Map.find(Reg)->second;
2361
2362    // It's not worthwhile looking for reuse if there's only one offset.
2363    if (Imms.size() == 1)
2364      continue;
2365
2366    DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
2367          for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
2368               J != JE; ++J)
2369            dbgs() << ' ' << J->first;
2370          dbgs() << '\n');
2371
2372    // Examine each offset.
2373    for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
2374         J != JE; ++J) {
2375      const SCEV *OrigReg = J->second;
2376
2377      int64_t JImm = J->first;
2378      const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
2379
2380      if (!isa<SCEVConstant>(OrigReg) &&
2381          UsedByIndicesMap[Reg].count() == 1) {
2382        DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
2383        continue;
2384      }
2385
2386      // Conservatively examine offsets between this orig reg a few selected
2387      // other orig regs.
2388      ImmMapTy::const_iterator OtherImms[] = {
2389        Imms.begin(), prior(Imms.end()),
2390        Imms.upper_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
2391      };
2392      for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
2393        ImmMapTy::const_iterator M = OtherImms[i];
2394        if (M == J || M == JE) continue;
2395
2396        // Compute the difference between the two.
2397        int64_t Imm = (uint64_t)JImm - M->first;
2398        for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
2399             LUIdx = UsedByIndices.find_next(LUIdx))
2400          // Make a memo of this use, offset, and register tuple.
2401          if (UniqueItems.insert(std::make_pair(LUIdx, Imm)))
2402            WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
2403      }
2404    }
2405  }
2406
2407  Map.clear();
2408  Sequence.clear();
2409  UsedByIndicesMap.clear();
2410  UniqueItems.clear();
2411
2412  // Now iterate through the worklist and add new formulae.
2413  for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(),
2414       E = WorkItems.end(); I != E; ++I) {
2415    const WorkItem &WI = *I;
2416    size_t LUIdx = WI.LUIdx;
2417    LSRUse &LU = Uses[LUIdx];
2418    int64_t Imm = WI.Imm;
2419    const SCEV *OrigReg = WI.OrigReg;
2420
2421    const Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
2422    const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
2423    unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
2424
2425    // TODO: Use a more targeted data structure.
2426    for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
2427      Formula F = LU.Formulae[L];
2428      // Use the immediate in the scaled register.
2429      if (F.ScaledReg == OrigReg) {
2430        int64_t Offs = (uint64_t)F.AM.BaseOffs +
2431                       Imm * (uint64_t)F.AM.Scale;
2432        // Don't create 50 + reg(-50).
2433        if (F.referencesReg(SE.getSCEV(
2434                   ConstantInt::get(IntTy, -(uint64_t)Offs))))
2435          continue;
2436        Formula NewF = F;
2437        NewF.AM.BaseOffs = Offs;
2438        if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
2439                        LU.Kind, LU.AccessTy, TLI))
2440          continue;
2441        NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
2442
2443        // If the new scale is a constant in a register, and adding the constant
2444        // value to the immediate would produce a value closer to zero than the
2445        // immediate itself, then the formula isn't worthwhile.
2446        if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
2447          if (C->getValue()->getValue().isNegative() !=
2448                (NewF.AM.BaseOffs < 0) &&
2449              (C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale))
2450                .ule(APInt(BitWidth, NewF.AM.BaseOffs).abs()))
2451            continue;
2452
2453        // OK, looks good.
2454        (void)InsertFormula(LU, LUIdx, NewF);
2455      } else {
2456        // Use the immediate in a base register.
2457        for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
2458          const SCEV *BaseReg = F.BaseRegs[N];
2459          if (BaseReg != OrigReg)
2460            continue;
2461          Formula NewF = F;
2462          NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
2463          if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
2464                          LU.Kind, LU.AccessTy, TLI))
2465            continue;
2466          NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
2467
2468          // If the new formula has a constant in a register, and adding the
2469          // constant value to the immediate would produce a value closer to
2470          // zero than the immediate itself, then the formula isn't worthwhile.
2471          for (SmallVectorImpl<const SCEV *>::const_iterator
2472               J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end();
2473               J != JE; ++J)
2474            if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J))
2475              if (C->getValue()->getValue().isNegative() !=
2476                    (NewF.AM.BaseOffs < 0) &&
2477                  C->getValue()->getValue().abs()
2478                    .ule(APInt(BitWidth, NewF.AM.BaseOffs).abs()))
2479                goto skip_formula;
2480
2481          // Ok, looks good.
2482          (void)InsertFormula(LU, LUIdx, NewF);
2483          break;
2484        skip_formula:;
2485        }
2486      }
2487    }
2488  }
2489}
2490
2491/// GenerateAllReuseFormulae - Generate formulae for each use.
2492void
2493LSRInstance::GenerateAllReuseFormulae() {
2494  // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
2495  // queries are more precise.
2496  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2497    LSRUse &LU = Uses[LUIdx];
2498    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2499      GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
2500    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2501      GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
2502  }
2503  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2504    LSRUse &LU = Uses[LUIdx];
2505    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2506      GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
2507    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2508      GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
2509    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2510      GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
2511    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2512      GenerateScales(LU, LUIdx, LU.Formulae[i]);
2513  }
2514  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2515    LSRUse &LU = Uses[LUIdx];
2516    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2517      GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
2518  }
2519
2520  GenerateCrossUseConstantOffsets();
2521}
2522
2523/// If their are multiple formulae with the same set of registers used
2524/// by other uses, pick the best one and delete the others.
2525void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
2526#ifndef NDEBUG
2527  bool Changed = false;
2528#endif
2529
2530  // Collect the best formula for each unique set of shared registers. This
2531  // is reset for each use.
2532  typedef DenseMap<SmallVector<const SCEV *, 2>, size_t, UniquifierDenseMapInfo>
2533    BestFormulaeTy;
2534  BestFormulaeTy BestFormulae;
2535
2536  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2537    LSRUse &LU = Uses[LUIdx];
2538    FormulaSorter Sorter(L, LU, SE, DT);
2539
2540    // Clear out the set of used regs; it will be recomputed.
2541    LU.Regs.clear();
2542
2543    for (size_t FIdx = 0, NumForms = LU.Formulae.size();
2544         FIdx != NumForms; ++FIdx) {
2545      Formula &F = LU.Formulae[FIdx];
2546
2547      SmallVector<const SCEV *, 2> Key;
2548      for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(),
2549           JE = F.BaseRegs.end(); J != JE; ++J) {
2550        const SCEV *Reg = *J;
2551        if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
2552          Key.push_back(Reg);
2553      }
2554      if (F.ScaledReg &&
2555          RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
2556        Key.push_back(F.ScaledReg);
2557      // Unstable sort by host order ok, because this is only used for
2558      // uniquifying.
2559      std::sort(Key.begin(), Key.end());
2560
2561      std::pair<BestFormulaeTy::const_iterator, bool> P =
2562        BestFormulae.insert(std::make_pair(Key, FIdx));
2563      if (!P.second) {
2564        Formula &Best = LU.Formulae[P.first->second];
2565        if (Sorter.operator()(F, Best))
2566          std::swap(F, Best);
2567        DEBUG(dbgs() << "Filtering out "; F.print(dbgs());
2568              dbgs() << "\n"
2569                        "  in favor of "; Best.print(dbgs());
2570              dbgs() << '\n');
2571#ifndef NDEBUG
2572        Changed = true;
2573#endif
2574        std::swap(F, LU.Formulae.back());
2575        LU.Formulae.pop_back();
2576        --FIdx;
2577        --NumForms;
2578        continue;
2579      }
2580      if (F.ScaledReg) LU.Regs.insert(F.ScaledReg);
2581      LU.Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
2582    }
2583    BestFormulae.clear();
2584  }
2585
2586  DEBUG(if (Changed) {
2587          dbgs() << "\n"
2588                    "After filtering out undesirable candidates:\n";
2589          print_uses(dbgs());
2590        });
2591}
2592
2593/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of
2594/// formulae to choose from, use some rough heuristics to prune down the number
2595/// of formulae. This keeps the main solver from taking an extraordinary amount
2596/// of time in some worst-case scenarios.
2597void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
2598  // This is a rough guess that seems to work fairly well.
2599  const size_t Limit = UINT16_MAX;
2600
2601  SmallPtrSet<const SCEV *, 4> Taken;
2602  for (;;) {
2603    // Estimate the worst-case number of solutions we might consider. We almost
2604    // never consider this many solutions because we prune the search space,
2605    // but the pruning isn't always sufficient.
2606    uint32_t Power = 1;
2607    for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
2608         E = Uses.end(); I != E; ++I) {
2609      size_t FSize = I->Formulae.size();
2610      if (FSize >= Limit) {
2611        Power = Limit;
2612        break;
2613      }
2614      Power *= FSize;
2615      if (Power >= Limit)
2616        break;
2617    }
2618    if (Power < Limit)
2619      break;
2620
2621    // Ok, we have too many of formulae on our hands to conveniently handle.
2622    // Use a rough heuristic to thin out the list.
2623
2624    // Pick the register which is used by the most LSRUses, which is likely
2625    // to be a good reuse register candidate.
2626    const SCEV *Best = 0;
2627    unsigned BestNum = 0;
2628    for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
2629         I != E; ++I) {
2630      const SCEV *Reg = *I;
2631      if (Taken.count(Reg))
2632        continue;
2633      if (!Best)
2634        Best = Reg;
2635      else {
2636        unsigned Count = RegUses.getUsedByIndices(Reg).count();
2637        if (Count > BestNum) {
2638          Best = Reg;
2639          BestNum = Count;
2640        }
2641      }
2642    }
2643
2644    DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
2645                 << " will yield profitable reuse.\n");
2646    Taken.insert(Best);
2647
2648    // In any use with formulae which references this register, delete formulae
2649    // which don't reference it.
2650    for (SmallVectorImpl<LSRUse>::iterator I = Uses.begin(),
2651         E = Uses.end(); I != E; ++I) {
2652      LSRUse &LU = *I;
2653      if (!LU.Regs.count(Best)) continue;
2654
2655      // Clear out the set of used regs; it will be recomputed.
2656      LU.Regs.clear();
2657
2658      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
2659        Formula &F = LU.Formulae[i];
2660        if (!F.referencesReg(Best)) {
2661          DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
2662          std::swap(LU.Formulae.back(), F);
2663          LU.Formulae.pop_back();
2664          --e;
2665          --i;
2666          continue;
2667        }
2668
2669        if (F.ScaledReg) LU.Regs.insert(F.ScaledReg);
2670        LU.Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
2671      }
2672    }
2673
2674    DEBUG(dbgs() << "After pre-selection:\n";
2675          print_uses(dbgs()));
2676  }
2677}
2678
/// SolveRecurse - This is the recursive solver. Each invocation chooses a
/// formula for the use whose index equals Workspace.size(), and recurses to
/// handle the remaining uses.
///
/// Solution and SolutionCost hold the best complete assignment found so far
/// and its cost; both are overwritten whenever a cheaper complete assignment
/// is found. Workspace is the in-progress assignment (one formula per use
/// already decided), and CurCost and CurRegs are the cost and register set
/// accumulated for it. VisitedRegs is handed to Cost::RateFormula and is
/// extended here with the register of each single-register formula that has
/// been fully explored for the first use.
void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                               Cost &SolutionCost,
                               SmallVectorImpl<const Formula *> &Workspace,
                               const Cost &CurCost,
                               const SmallPtrSet<const SCEV *, 16> &CurRegs,
                               DenseSet<const SCEV *> &VisitedRegs) const {
  // Some ideas:
  //  - prune more:
  //    - use more aggressive filtering
  //    - sort the formula so that the most profitable solutions are found first
  //    - sort the uses too
  //  - search faster:
  //    - don't compute a cost, and then compare. compare while computing a cost
  //      and bail early.
  //    - track register sets with SmallBitVector

  // The use to decide in this invocation.
  const LSRUse &LU = Uses[Workspace.size()];

  // If this use references any register that's already a part of the
  // in-progress solution, consider it a requirement that a formula must
  // reference that register in order to be considered. This prunes out
  // unprofitable searching.
  SmallSetVector<const SCEV *, 4> ReqRegs;
  for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(),
       E = CurRegs.end(); I != E; ++I)
    if (LU.Regs.count(*I))
      ReqRegs.insert(*I);

  bool AnySatisfiedReqRegs = false;
  SmallPtrSet<const SCEV *, 16> NewRegs;
  Cost NewCost;
retry:
  for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
       E = LU.Formulae.end(); I != E; ++I) {
    const Formula &F = *I;

    // Ignore formulae which do not use all of the required registers: a
    // formula is skipped as soon as one required register is neither its
    // scaled register nor among its base registers.
    for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(),
         JE = ReqRegs.end(); J != JE; ++J) {
      const SCEV *Reg = *J;
      if ((!F.ScaledReg || F.ScaledReg != Reg) &&
          std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) ==
          F.BaseRegs.end())
        goto skip;
    }
    AnySatisfiedReqRegs = true;

    // Evaluate the cost of the current formula. If it's already worse than
    // the current best, prune the search at that point.
    NewCost = CurCost;
    NewRegs = CurRegs;
    NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT);
    if (NewCost < SolutionCost) {
      Workspace.push_back(&F);
      if (Workspace.size() != Uses.size()) {
        // More uses remain to be decided; recurse for the next one.
        SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
                     NewRegs, VisitedRegs);
        // Once a single-register formula for the first use has been fully
        // explored, record its register in VisitedRegs.
        if (F.getNumRegs() == 1 && Workspace.size() == 1)
          VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
      } else {
        // Every use has a formula and the total cost beats the best known
        // solution, so adopt this assignment as the new best.
        DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
              dbgs() << ". Regs:";
              for (SmallPtrSet<const SCEV *, 16>::const_iterator
                   I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I)
                dbgs() << ' ' << **I;
              dbgs() << '\n');

        SolutionCost = NewCost;
        Solution = Workspace;
      }
      Workspace.pop_back();
    }
  skip:;
  }

  // If none of the formulae had all of the required registers, relax the
  // constraint so that we don't exclude all formulae.
  // NOTE(review): if LU.Formulae were ever empty, AnySatisfiedReqRegs would
  // stay false and this retry would never terminate; this presumably relies
  // on every use having at least one formula -- confirm.
  if (!AnySatisfiedReqRegs) {
    ReqRegs.clear();
    goto retry;
  }
}
2762
2763void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
2764  SmallVector<const Formula *, 8> Workspace;
2765  Cost SolutionCost;
2766  SolutionCost.Loose();
2767  Cost CurCost;
2768  SmallPtrSet<const SCEV *, 16> CurRegs;
2769  DenseSet<const SCEV *> VisitedRegs;
2770  Workspace.reserve(Uses.size());
2771
2772  SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
2773               CurRegs, VisitedRegs);
2774
2775  // Ok, we've now made all our decisions.
2776  DEBUG(dbgs() << "\n"
2777                  "The chosen solution requires "; SolutionCost.print(dbgs());
2778        dbgs() << ":\n";
2779        for (size_t i = 0, e = Uses.size(); i != e; ++i) {
2780          dbgs() << "  ";
2781          Uses[i].print(dbgs());
2782          dbgs() << "\n"
2783                    "    ";
2784          Solution[i]->print(dbgs());
2785          dbgs() << '\n';
2786        });
2787}
2788
2789/// getImmediateDominator - A handy utility for the specific DominatorTree
2790/// query that we need here.
2791///
2792static BasicBlock *getImmediateDominator(BasicBlock *BB, DominatorTree &DT) {
2793  DomTreeNode *Node = DT.getNode(BB);
2794  if (!Node) return 0;
2795  Node = Node->getIDom();
2796  if (!Node) return 0;
2797  return Node->getBlock();
2798}
2799
2800Value *LSRInstance::Expand(const LSRFixup &LF,
2801                           const Formula &F,
2802                           BasicBlock::iterator IP,
2803                           SCEVExpander &Rewriter,
2804                           SmallVectorImpl<WeakVH> &DeadInsts) const {
2805  const LSRUse &LU = Uses[LF.LUIdx];
2806
2807  // Then, collect some instructions which must be dominated by the
2808  // expanding replacement. These must be dominated by any operands that
2809  // will be required in the expansion.
2810  SmallVector<Instruction *, 4> Inputs;
2811  if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
2812    Inputs.push_back(I);
2813  if (LU.Kind == LSRUse::ICmpZero)
2814    if (Instruction *I =
2815          dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
2816      Inputs.push_back(I);
2817  if (LF.PostIncLoops.count(L)) {
2818    if (LF.isUseFullyOutsideLoop(L))
2819      Inputs.push_back(L->getLoopLatch()->getTerminator());
2820    else
2821      Inputs.push_back(IVIncInsertPos);
2822  }
2823  // The expansion must also be dominated by the increment positions of any
2824  // loops it for which it is using post-inc mode.
2825  for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(),
2826       E = LF.PostIncLoops.end(); I != E; ++I) {
2827    const Loop *PIL = *I;
2828    if (PIL == L) continue;
2829
2830    SmallVector<BasicBlock *, 4> ExitingBlocks;
2831    PIL->getExitingBlocks(ExitingBlocks);
2832    if (!ExitingBlocks.empty()) {
2833      BasicBlock *BB = ExitingBlocks[0];
2834      for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
2835        BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
2836      Inputs.push_back(BB->getTerminator());
2837    }
2838  }
2839
2840  // Then, climb up the immediate dominator tree as far as we can go while
2841  // still being dominated by the input positions.
2842  for (;;) {
2843    bool AllDominate = true;
2844    Instruction *BetterPos = 0;
2845    BasicBlock *IDom = getImmediateDominator(IP->getParent(), DT);
2846    if (!IDom) break;
2847    Instruction *Tentative = IDom->getTerminator();
2848    for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),
2849         E = Inputs.end(); I != E; ++I) {
2850      Instruction *Inst = *I;
2851      if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
2852        AllDominate = false;
2853        break;
2854      }
2855      if (IDom == Inst->getParent() &&
2856          (!BetterPos || DT.dominates(BetterPos, Inst)))
2857        BetterPos = next(BasicBlock::iterator(Inst));
2858    }
2859    if (!AllDominate)
2860      break;
2861    if (BetterPos)
2862      IP = BetterPos;
2863    else
2864      IP = Tentative;
2865  }
2866  while (isa<PHINode>(IP)) ++IP;
2867  while (isa<DbgInfoIntrinsic>(IP)) ++IP;
2868
2869  // Inform the Rewriter if we have a post-increment use, so that it can
2870  // perform an advantageous expansion.
2871  Rewriter.setPostInc(LF.PostIncLoops);
2872
2873  // This is the type that the user actually needs.
2874  const Type *OpTy = LF.OperandValToReplace->getType();
2875  // This will be the type that we'll initially expand to.
2876  const Type *Ty = F.getType();
2877  if (!Ty)
2878    // No type known; just expand directly to the ultimate type.
2879    Ty = OpTy;
2880  else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
2881    // Expand directly to the ultimate type if it's the right size.
2882    Ty = OpTy;
2883  // This is the type to do integer arithmetic in.
2884  const Type *IntTy = SE.getEffectiveSCEVType(Ty);
2885
2886  // Build up a list of operands to add together to form the full base.
2887  SmallVector<const SCEV *, 8> Ops;
2888
2889  // Expand the BaseRegs portion.
2890  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
2891       E = F.BaseRegs.end(); I != E; ++I) {
2892    const SCEV *Reg = *I;
2893    assert(!Reg->isZero() && "Zero allocated in a base register!");
2894
2895    // If we're expanding for a post-inc user, make the post-inc adjustment.
2896    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
2897    Reg = TransformForPostIncUse(Denormalize, Reg,
2898                                 LF.UserInst, LF.OperandValToReplace,
2899                                 Loops, SE, DT);
2900
2901    Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP)));
2902  }
2903
2904  // Flush the operand list to suppress SCEVExpander hoisting.
2905  if (!Ops.empty()) {
2906    Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
2907    Ops.clear();
2908    Ops.push_back(SE.getUnknown(FullV));
2909  }
2910
2911  // Expand the ScaledReg portion.
2912  Value *ICmpScaledV = 0;
2913  if (F.AM.Scale != 0) {
2914    const SCEV *ScaledS = F.ScaledReg;
2915
2916    // If we're expanding for a post-inc user, make the post-inc adjustment.
2917    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
2918    ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
2919                                     LF.UserInst, LF.OperandValToReplace,
2920                                     Loops, SE, DT);
2921
2922    if (LU.Kind == LSRUse::ICmpZero) {
2923      // An interesting way of "folding" with an icmp is to use a negated
2924      // scale, which we'll implement by inserting it into the other operand
2925      // of the icmp.
2926      assert(F.AM.Scale == -1 &&
2927             "The only scale supported by ICmpZero uses is -1!");
2928      ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP);
2929    } else {
2930      // Otherwise just expand the scaled register and an explicit scale,
2931      // which is expected to be matched as part of the address.
2932      ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));
2933      ScaledS = SE.getMulExpr(ScaledS,
2934                              SE.getIntegerSCEV(F.AM.Scale,
2935                                                ScaledS->getType()));
2936      Ops.push_back(ScaledS);
2937
2938      // Flush the operand list to suppress SCEVExpander hoisting.
2939      Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
2940      Ops.clear();
2941      Ops.push_back(SE.getUnknown(FullV));
2942    }
2943  }
2944
2945  // Expand the GV portion.
2946  if (F.AM.BaseGV) {
2947    Ops.push_back(SE.getUnknown(F.AM.BaseGV));
2948
2949    // Flush the operand list to suppress SCEVExpander hoisting.
2950    Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
2951    Ops.clear();
2952    Ops.push_back(SE.getUnknown(FullV));
2953  }
2954
2955  // Expand the immediate portion.
2956  int64_t Offset = (uint64_t)F.AM.BaseOffs + LF.Offset;
2957  if (Offset != 0) {
2958    if (LU.Kind == LSRUse::ICmpZero) {
2959      // The other interesting way of "folding" with an ICmpZero is to use a
2960      // negated immediate.
2961      if (!ICmpScaledV)
2962        ICmpScaledV = ConstantInt::get(IntTy, -Offset);
2963      else {
2964        Ops.push_back(SE.getUnknown(ICmpScaledV));
2965        ICmpScaledV = ConstantInt::get(IntTy, Offset);
2966      }
2967    } else {
2968      // Just add the immediate values. These again are expected to be matched
2969      // as part of the address.
2970      Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
2971    }
2972  }
2973
2974  // Emit instructions summing all the operands.
2975  const SCEV *FullS = Ops.empty() ?
2976                      SE.getIntegerSCEV(0, IntTy) :
2977                      SE.getAddExpr(Ops);
2978  Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP);
2979
2980  // We're done expanding now, so reset the rewriter.
2981  Rewriter.clearPostInc();
2982
2983  // An ICmpZero Formula represents an ICmp which we're handling as a
2984  // comparison against zero. Now that we've expanded an expression for that
2985  // form, update the ICmp's other operand.
2986  if (LU.Kind == LSRUse::ICmpZero) {
2987    ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
2988    DeadInsts.push_back(CI->getOperand(1));
2989    assert(!F.AM.BaseGV && "ICmp does not support folding a global value and "
2990                           "a scale at the same time!");
2991    if (F.AM.Scale == -1) {
2992      if (ICmpScaledV->getType() != OpTy) {
2993        Instruction *Cast =
2994          CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
2995                                                   OpTy, false),
2996                           ICmpScaledV, OpTy, "tmp", CI);
2997        ICmpScaledV = Cast;
2998      }
2999      CI->setOperand(1, ICmpScaledV);
3000    } else {
3001      assert(F.AM.Scale == 0 &&
3002             "ICmp does not support folding a global value and "
3003             "a scale at the same time!");
3004      Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
3005                                           -(uint64_t)Offset);
3006      if (C->getType() != OpTy)
3007        C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
3008                                                          OpTy, false),
3009                                  C, OpTy);
3010
3011      CI->setOperand(1, C);
3012    }
3013  }
3014
3015  return FullV;
3016}
3017
/// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use
/// of their operands effectively happens in their predecessor blocks, so the
/// expression may need to be expanded in multiple places.
///
/// Every incoming value of PN that equals LF.OperandValToReplace is replaced
/// with an expansion of F placed before the terminator of the corresponding
/// incoming block. Critical edges are split first where appropriate, and a
/// block that appears more than once reuses the value already expanded for it.
void LSRInstance::RewriteForPHI(PHINode *PN,
                                const LSRFixup &LF,
                                const Formula &F,
                                SCEVExpander &Rewriter,
                                SmallVectorImpl<WeakVH> &DeadInsts,
                                Pass *P) const {
  // Values already expanded, keyed by the incoming block they were expanded
  // in.
  DenseMap<BasicBlock *, Value *> Inserted;
  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
    if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
      BasicBlock *BB = PN->getIncomingBlock(i);

      // If this is a critical edge, split the edge so that we do not insert
      // the code on all predecessor/successor paths.  We do this unless this
      // is the canonical backedge for this loop, which complicates post-inc
      // users.
      if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
          !isa<IndirectBrInst>(BB->getTerminator()) &&
          (PN->getParent() != L->getHeader() || !L->contains(BB))) {
        // Split the critical edge.
        BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P);

        // If PN is outside of the loop and BB is in the loop, we want to
        // move the block to be immediately before the PHI block, not
        // immediately after BB.
        if (L->contains(BB) && !L->contains(PN))
          NewBB->moveBefore(PN->getParent());

        // Splitting the edge can reduce the number of PHI entries we have.
        // Refresh the loop bound and continue from the entry for the new
        // block.
        e = PN->getNumIncomingValues();
        BB = NewBB;
        i = PN->getBasicBlockIndex(BB);
      }

      // Insert a null placeholder for this block; if an entry already
      // exists, reuse the value expanded earlier instead of expanding again.
      std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
        Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
      if (!Pair.second)
        PN->setIncomingValue(i, Pair.first->second);
      else {
        Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts);

        // If this is reuse-by-noop-cast, insert the noop cast.
        const Type *OpTy = LF.OperandValToReplace->getType();
        if (FullV->getType() != OpTy)
          FullV =
            CastInst::Create(CastInst::getCastOpcode(FullV, false,
                                                     OpTy, false),
                             FullV, LF.OperandValToReplace->getType(),
                             "tmp", BB->getTerminator());

        PN->setIncomingValue(i, FullV);
        // Fill in the placeholder so later entries for this block reuse it.
        Pair.first->second = FullV;
      }
    }
}
3075
3076/// Rewrite - Emit instructions for the leading candidate expression for this
3077/// LSRUse (this is called "expanding"), and update the UserInst to reference
3078/// the newly expanded value.
3079void LSRInstance::Rewrite(const LSRFixup &LF,
3080                          const Formula &F,
3081                          SCEVExpander &Rewriter,
3082                          SmallVectorImpl<WeakVH> &DeadInsts,
3083                          Pass *P) const {
3084  // First, find an insertion point that dominates UserInst. For PHI nodes,
3085  // find the nearest block which dominates all the relevant uses.
3086  if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
3087    RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P);
3088  } else {
3089    Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts);
3090
3091    // If this is reuse-by-noop-cast, insert the noop cast.
3092    const Type *OpTy = LF.OperandValToReplace->getType();
3093    if (FullV->getType() != OpTy) {
3094      Instruction *Cast =
3095        CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
3096                         FullV, OpTy, "tmp", LF.UserInst);
3097      FullV = Cast;
3098    }
3099
3100    // Update the user. ICmpZero is handled specially here (for now) because
3101    // Expand may have updated one of the operands of the icmp already, and
3102    // its new value may happen to be equal to LF.OperandValToReplace, in
3103    // which case doing replaceUsesOfWith leads to replacing both operands
3104    // with the same value. TODO: Reorganize this.
3105    if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
3106      LF.UserInst->setOperand(0, FullV);
3107    else
3108      LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
3109  }
3110
3111  DeadInsts.push_back(LF.OperandValToReplace);
3112}
3113
3114void
3115LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
3116                               Pass *P) {
3117  // Keep track of instructions we may have made dead, so that
3118  // we can remove them after we are done working.
3119  SmallVector<WeakVH, 16> DeadInsts;
3120
3121  SCEVExpander Rewriter(SE);
3122  Rewriter.disableCanonicalMode();
3123  Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
3124
3125  // Expand the new value definitions and update the users.
3126  for (size_t i = 0, e = Fixups.size(); i != e; ++i) {
3127    size_t LUIdx = Fixups[i].LUIdx;
3128
3129    Rewrite(Fixups[i], *Solution[LUIdx], Rewriter, DeadInsts, P);
3130
3131    Changed = true;
3132  }
3133
3134  // Clean up after ourselves. This must be done before deleting any
3135  // instructions.
3136  Rewriter.clear();
3137
3138  Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
3139}
3140
/// LSRInstance - Run the whole transformation on loop l: collect the uses,
/// generate candidate formulae for them, prune the candidates, solve for one
/// formula per use, and rewrite the code accordingly. The required analyses
/// are obtained from P; tli is stored for later legality queries.
LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
  : IU(P->getAnalysis<IVUsers>()),
    SE(P->getAnalysis<ScalarEvolution>()),
    DT(P->getAnalysis<DominatorTree>()),
    TLI(tli), L(l), Changed(false), IVIncInsertPos(0) {

  // If LoopSimplify form is not available, stay out of trouble.
  if (!L->isLoopSimplifyForm()) return;

  // If there's no interesting work to be done, bail early.
  if (IU.empty()) return;

  DEBUG(dbgs() << "\nLSR on loop ";
        WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false);
        dbgs() << ":\n");

  // OptimizeShadowIV - If IV is used in a int-to-float cast
  // inside the loop then try to eliminate the cast operation.
  OptimizeShadowIV();

  // Change loop terminating condition to use the postinc iv when possible.
  Changed |= OptimizeLoopTermCond();

  // Gather the uses and their initial formulae.
  CollectInterestingTypesAndFactors();
  CollectFixupsAndInitialFormulae();
  CollectLoopInvariantFixupsAndFormulae();

  DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
        print_uses(dbgs()));

  // Now use the reuse data to generate a bunch of interesting ways
  // to formulate the values needed for the uses.
  GenerateAllReuseFormulae();

  DEBUG(dbgs() << "\n"
                  "After generating reuse formulae:\n";
        print_uses(dbgs()));

  // Prune the candidate formulae before handing them to the solver.
  FilterOutUndesirableDedicatedRegisters();
  NarrowSearchSpaceUsingHeuristics();

  // Pick one formula per use.
  SmallVector<const Formula *, 8> Solution;
  Solve(Solution);
  assert(Solution.size() == Uses.size() && "Malformed solution!");

  // Release memory that is no longer needed.
  Factors.clear();
  Types.clear();
  RegUses.clear();

#ifndef NDEBUG
  // Formulae should be legal. This check covers every remaining formula of
  // every use, not only the ones chosen by the solver.
  for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
       E = Uses.end(); I != E; ++I) {
     const LSRUse &LU = *I;
     for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
          JE = LU.Formulae.end(); J != JE; ++J)
        assert(isLegalUse(J->AM, LU.MinOffset, LU.MaxOffset,
                          LU.Kind, LU.AccessTy, TLI) &&
               "Illegal formula generated!");
  };
#endif

  // Now that we've decided what we want, make it so.
  ImplementSolution(Solution, P);
}
3207
3208void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
3209  if (Factors.empty() && Types.empty()) return;
3210
3211  OS << "LSR has identified the following interesting factors and types: ";
3212  bool First = true;
3213
3214  for (SmallSetVector<int64_t, 8>::const_iterator
3215       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
3216    if (!First) OS << ", ";
3217    First = false;
3218    OS << '*' << *I;
3219  }
3220
3221  for (SmallSetVector<const Type *, 4>::const_iterator
3222       I = Types.begin(), E = Types.end(); I != E; ++I) {
3223    if (!First) OS << ", ";
3224    First = false;
3225    OS << '(' << **I << ')';
3226  }
3227  OS << '\n';
3228}
3229
3230void LSRInstance::print_fixups(raw_ostream &OS) const {
3231  OS << "LSR is examining the following fixup sites:\n";
3232  for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
3233       E = Fixups.end(); I != E; ++I) {
3234    const LSRFixup &LF = *I;
3235    dbgs() << "  ";
3236    LF.print(OS);
3237    OS << '\n';
3238  }
3239}
3240
3241void LSRInstance::print_uses(raw_ostream &OS) const {
3242  OS << "LSR is examining the following uses:\n";
3243  for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
3244       E = Uses.end(); I != E; ++I) {
3245    const LSRUse &LU = *I;
3246    dbgs() << "  ";
3247    LU.print(OS);
3248    OS << '\n';
3249    for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
3250         JE = LU.Formulae.end(); J != JE; ++J) {
3251      OS << "    ";
3252      J->print(OS);
3253      OS << '\n';
3254    }
3255  }
3256}
3257
3258void LSRInstance::print(raw_ostream &OS) const {
3259  print_factors_and_types(OS);
3260  print_fixups(OS);
3261  print_uses(OS);
3262}
3263
3264void LSRInstance::dump() const {
3265  print(errs()); errs() << '\n';
3266}
3267
3268namespace {
3269
3270class LoopStrengthReduce : public LoopPass {
3271  /// TLI - Keep a pointer of a TargetLowering to consult for determining
3272  /// transformation profitability.
3273  const TargetLowering *const TLI;
3274
3275public:
3276  static char ID; // Pass ID, replacement for typeid
3277  explicit LoopStrengthReduce(const TargetLowering *tli = 0);
3278
3279private:
3280  bool runOnLoop(Loop *L, LPPassManager &LPM);
3281  void getAnalysisUsage(AnalysisUsage &AU) const;
3282};
3283
3284}
3285
3286char LoopStrengthReduce::ID = 0;
3287static RegisterPass<LoopStrengthReduce>
3288X("loop-reduce", "Loop Strength Reduction");
3289
3290Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
3291  return new LoopStrengthReduce(TLI);
3292}
3293
3294LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli)
3295  : LoopPass(&ID), TLI(tli) {}
3296
3297void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
3298  // We split critical edges, so we change the CFG.  However, we do update
3299  // many analyses if they are around.
3300  AU.addPreservedID(LoopSimplifyID);
3301  AU.addPreserved<LoopInfo>();
3302  AU.addPreserved("domfrontier");
3303
3304  AU.addRequiredID(LoopSimplifyID);
3305  AU.addRequired<DominatorTree>();
3306  AU.addPreserved<DominatorTree>();
3307  AU.addRequired<ScalarEvolution>();
3308  AU.addPreserved<ScalarEvolution>();
3309  AU.addRequired<IVUsers>();
3310  AU.addPreserved<IVUsers>();
3311}
3312
3313bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
3314  bool Changed = false;
3315
3316  // Run the main LSR transformation.
3317  Changed |= LSRInstance(TLI, L, this).getChanged();
3318
3319  // At this point, it is worth checking to see if any recurrence PHIs are also
3320  // dead, so that we can remove them as well.
3321  Changed |= DeleteDeadPHIs(L->getHeader());
3322
3323  return Changed;
3324}
3325