LoopStrengthReduce.cpp revision f182b23f8fed014c26da48d94890080c7d916a0c
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This transformation analyzes and transforms the induction variables (and
11// computations derived from them) into forms suitable for efficient execution
12// on the target.
13//
14// This pass performs a strength reduction on array references inside loops
15// that have the loop induction variable as one or more of their components.
16// It rewrites expressions to take advantage of scaled-index addressing modes
17// available on the target, and it performs a variety of other optimizations
18// related to loop induction variables.
19//
20// Terminology note: this code has a lot of handling for "post-increment" or
21// "post-inc" users. This is not talking about post-increment addressing modes;
22// it is instead talking about code like this:
23//
24//   %i = phi [ 0, %entry ], [ %i.next, %latch ]
25//   ...
26//   %i.next = add %i, 1
27//   %c = icmp eq %i.next, %n
28//
29// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
30// it's useful to think about these as the same register, with some uses using
31// the value of the register before the add and some using it after. In this
32// example, the icmp is a post-increment user, since it uses %i.next, which is
33// the value of the induction variable after the increment. The other common
34// case of post-increment users is users outside the loop.
35//
36// TODO: More sophistication in the way Formulae are generated and filtered.
37//
38// TODO: Handle multiple loops at a time.
39//
40// TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr
41//       instead of a GlobalValue?
42//
43// TODO: When truncation is free, truncate ICmp users' operands to make it a
44//       smaller encoding (on x86 at least).
45//
46// TODO: When a negated register is used by an add (such as in a list of
47//       multiple base registers, or as the increment expression in an addrec),
48//       we may not actually need both reg and (-1 * reg) in registers; the
49//       negation can be implemented by using a sub instead of an add. The
50//       lack of support for taking this into consideration when making
51//       register pressure decisions is partly worked around by the "Special"
52//       use kind.
53//
54//===----------------------------------------------------------------------===//
55
56#define DEBUG_TYPE "loop-reduce"
57#include "llvm/Transforms/Scalar.h"
58#include "llvm/Constants.h"
59#include "llvm/Instructions.h"
60#include "llvm/IntrinsicInst.h"
61#include "llvm/DerivedTypes.h"
62#include "llvm/Analysis/IVUsers.h"
63#include "llvm/Analysis/Dominators.h"
64#include "llvm/Analysis/LoopPass.h"
65#include "llvm/Analysis/ScalarEvolutionExpander.h"
66#include "llvm/Transforms/Utils/BasicBlockUtils.h"
67#include "llvm/Transforms/Utils/Local.h"
68#include "llvm/ADT/SmallBitVector.h"
69#include "llvm/ADT/SetVector.h"
70#include "llvm/ADT/DenseSet.h"
71#include "llvm/Support/Debug.h"
72#include "llvm/Support/ValueHandle.h"
73#include "llvm/Support/raw_ostream.h"
74#include "llvm/Target/TargetLowering.h"
75#include <algorithm>
76using namespace llvm;
77
78namespace {
79
80/// RegSortData - This class holds data which is used to order reuse candidates.
81class RegSortData {
82public:
83  /// UsedByIndices - This represents the set of LSRUse indices which reference
84  /// a particular register.
85  SmallBitVector UsedByIndices;
86
87  RegSortData() {}
88
89  void print(raw_ostream &OS) const;
90  void dump() const;
91};
92
93}
94
95void RegSortData::print(raw_ostream &OS) const {
96  OS << "[NumUses=" << UsedByIndices.count() << ']';
97}
98
99void RegSortData::dump() const {
100  print(errs()); errs() << '\n';
101}
102
103namespace {
104
105/// RegUseTracker - Map register candidates to information about how they are
106/// used.
107class RegUseTracker {
108  typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
109
110  RegUsesTy RegUsesMap;
111  SmallVector<const SCEV *, 16> RegSequence;
112
113public:
114  void CountRegister(const SCEV *Reg, size_t LUIdx);
115  void DropRegister(const SCEV *Reg, size_t LUIdx);
116  void DropUse(size_t LUIdx);
117
118  bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
119
120  const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
121
122  void clear();
123
124  typedef SmallVectorImpl<const SCEV *>::iterator iterator;
125  typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
126  iterator begin() { return RegSequence.begin(); }
127  iterator end()   { return RegSequence.end(); }
128  const_iterator begin() const { return RegSequence.begin(); }
129  const_iterator end() const   { return RegSequence.end(); }
130};
131
132}
133
134void
135RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) {
136  std::pair<RegUsesTy::iterator, bool> Pair =
137    RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
138  RegSortData &RSD = Pair.first->second;
139  if (Pair.second)
140    RegSequence.push_back(Reg);
141  RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
142  RSD.UsedByIndices.set(LUIdx);
143}
144
145void
146RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) {
147  RegUsesTy::iterator It = RegUsesMap.find(Reg);
148  assert(It != RegUsesMap.end());
149  RegSortData &RSD = It->second;
150  assert(RSD.UsedByIndices.size() > LUIdx);
151  RSD.UsedByIndices.reset(LUIdx);
152}
153
154void
155RegUseTracker::DropUse(size_t LUIdx) {
156  // Remove the use index from every register's use list.
157  for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end();
158       I != E; ++I)
159    I->second.UsedByIndices.reset(LUIdx);
160}
161
162bool
163RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
164  if (!RegUsesMap.count(Reg)) return false;
165  const SmallBitVector &UsedByIndices =
166    RegUsesMap.find(Reg)->second.UsedByIndices;
167  int i = UsedByIndices.find_first();
168  if (i == -1) return false;
169  if ((size_t)i != LUIdx) return true;
170  return UsedByIndices.find_next(i) != -1;
171}
172
173const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
174  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
175  assert(I != RegUsesMap.end() && "Unknown register!");
176  return I->second.UsedByIndices;
177}
178
179void RegUseTracker::clear() {
180  RegUsesMap.clear();
181  RegSequence.clear();
182}
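// A minimal usage sketch (illustrative only; the SCEV pointers RegA/RegB and
// the use indices below are hypothetical, not taken from this file):
//
//   RegUseTracker RegUses;
//   RegUses.CountRegister(RegA, /*LUIdx=*/0);   // use 0 references RegA
//   RegUses.CountRegister(RegA, /*LUIdx=*/1);   // use 1 also references RegA
//   RegUses.CountRegister(RegB, /*LUIdx=*/1);   // use 1 references RegB
//   RegUses.isRegUsedByUsesOtherThan(RegA, 1);  // true: use 0 also uses RegA
//   RegUses.isRegUsedByUsesOtherThan(RegB, 1);  // false: only use 1 uses RegB
//
// Iterating the tracker visits registers in the order they were first seen,
// since RegSequence preserves insertion order alongside the map.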
183
184namespace {
185
186/// Formula - This class holds information that describes a formula for
187/// computing a value that satisfies a use. It may include broken-out
188/// immediates and scaled registers.
189struct Formula {
190  /// AM - This is used to represent complex addressing, as well as other kinds
191  /// of interesting uses.
192  TargetLowering::AddrMode AM;
193
194  /// BaseRegs - The list of "base" registers for this use. When this is
195  /// non-empty, AM.HasBaseReg should be set to true.
196  SmallVector<const SCEV *, 2> BaseRegs;
197
198  /// ScaledReg - The 'scaled' register for this use. This should be non-null
199  /// when AM.Scale is not zero.
200  const SCEV *ScaledReg;
201
202  Formula() : ScaledReg(0) {}
203
204  void InitialMatch(const SCEV *S, Loop *L,
205                    ScalarEvolution &SE, DominatorTree &DT);
206
207  unsigned getNumRegs() const;
208  const Type *getType() const;
209
210  void DeleteBaseReg(const SCEV *&S);
211
212  bool referencesReg(const SCEV *S) const;
213  bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
214                                  const RegUseTracker &RegUses) const;
215
216  void print(raw_ostream &OS) const;
217  void dump() const;
218};
219
220}
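// For illustration: a Formula models an expression of the form
//
//   AM.BaseGV + AM.BaseOffs + BaseRegs[0] + ... + BaseRegs[N-1]
//                           + AM.Scale * ScaledReg
//
// so a use whose address is, say, "@gv + 16 + %i + 4*%j" could be described
// (hypothetically; GV, IReg, and JReg are placeholder names) as:
//
//   Formula F;
//   F.AM.BaseGV = GV;            // the GlobalValue @gv
//   F.AM.BaseOffs = 16;
//   F.AM.HasBaseReg = true;
//   F.BaseRegs.push_back(IReg);  // the SCEV for %i
//   F.AM.Scale = 4;
//   F.ScaledReg = JReg;          // the SCEV for %j
//
// which Formula::print below would render roughly as
// "gv + 16 + reg(%i) + 4*reg(%j)".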
221
222/// DoInitialMatch - Recursion helper for InitialMatch.
223static void DoInitialMatch(const SCEV *S, Loop *L,
224                           SmallVectorImpl<const SCEV *> &Good,
225                           SmallVectorImpl<const SCEV *> &Bad,
226                           ScalarEvolution &SE, DominatorTree &DT) {
227  // Collect expressions which properly dominate the loop header.
228  if (S->properlyDominates(L->getHeader(), &DT)) {
229    Good.push_back(S);
230    return;
231  }
232
233  // Look at add operands.
234  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
235    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
236         I != E; ++I)
237      DoInitialMatch(*I, L, Good, Bad, SE, DT);
238    return;
239  }
240
241  // Look at addrec operands.
242  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
243    if (!AR->getStart()->isZero()) {
244      DoInitialMatch(AR->getStart(), L, Good, Bad, SE, DT);
245      DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
246                                      AR->getStepRecurrence(SE),
247                                      AR->getLoop()),
248                     L, Good, Bad, SE, DT);
249      return;
250    }
251
252  // Handle a multiplication by -1 (negation) if it didn't fold.
253  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
254    if (Mul->getOperand(0)->isAllOnesValue()) {
255      SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
256      const SCEV *NewMul = SE.getMulExpr(Ops);
257
258      SmallVector<const SCEV *, 4> MyGood;
259      SmallVector<const SCEV *, 4> MyBad;
260      DoInitialMatch(NewMul, L, MyGood, MyBad, SE, DT);
261      const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
262        SE.getEffectiveSCEVType(NewMul->getType())));
263      for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(),
264           E = MyGood.end(); I != E; ++I)
265        Good.push_back(SE.getMulExpr(NegOne, *I));
266      for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(),
267           E = MyBad.end(); I != E; ++I)
268        Bad.push_back(SE.getMulExpr(NegOne, *I));
269      return;
270    }
271
272  // Ok, we can't do anything interesting. Just stuff the whole thing into a
273  // register and hope for the best.
274  Bad.push_back(S);
275}
276
277/// InitialMatch - Incorporate loop-variant parts of S into this Formula,
278/// attempting to keep all loop-invariant and loop-computable values in a
279/// single base register.
280void Formula::InitialMatch(const SCEV *S, Loop *L,
281                           ScalarEvolution &SE, DominatorTree &DT) {
282  SmallVector<const SCEV *, 4> Good;
283  SmallVector<const SCEV *, 4> Bad;
284  DoInitialMatch(S, L, Good, Bad, SE, DT);
285  if (!Good.empty()) {
286    const SCEV *Sum = SE.getAddExpr(Good);
287    if (!Sum->isZero())
288      BaseRegs.push_back(Sum);
289    AM.HasBaseReg = true;
290  }
291  if (!Bad.empty()) {
292    const SCEV *Sum = SE.getAddExpr(Bad);
293    if (!Sum->isZero())
294      BaseRegs.push_back(Sum);
295    AM.HasBaseReg = true;
296  }
297}
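// Example of the splitting above (values are illustrative): for
// S = {%a,+,4}<%L>, where %a is defined outside the loop, DoInitialMatch
// separates the loop-invariant start from the loop-varying recurrence, so
// the resulting Formula is roughly
//
//   BaseRegs = { %a, {0,+,4}<%L> }
//   AM.HasBaseReg = true
//
// keeping the invariant part in a base register of its own, where it is
// cheap to materialize in the preheader.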
298
299/// getNumRegs - Return the total number of register operands used by this
300/// formula. This does not include register uses implied by non-constant
301/// addrec strides.
302unsigned Formula::getNumRegs() const {
303  return !!ScaledReg + BaseRegs.size();
304}
305
306/// getType - Return the type of this formula, if it has one, or null
307/// otherwise. This type is meaningless except for the bit size.
308const Type *Formula::getType() const {
309  return !BaseRegs.empty() ? BaseRegs.front()->getType() :
310         ScaledReg ? ScaledReg->getType() :
311         AM.BaseGV ? AM.BaseGV->getType() :
312         0;
313}
314
315/// DeleteBaseReg - Delete the given base reg from the BaseRegs list.
316void Formula::DeleteBaseReg(const SCEV *&S) {
317  if (&S != &BaseRegs.back())
318    std::swap(S, BaseRegs.back());
319  BaseRegs.pop_back();
320}
321
322/// referencesReg - Test if this formula references the given register.
323bool Formula::referencesReg(const SCEV *S) const {
324  return S == ScaledReg ||
325         std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
326}
327
328/// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers
329/// which are used by uses other than the use with the given index.
330bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
331                                         const RegUseTracker &RegUses) const {
332  if (ScaledReg)
333    if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
334      return true;
335  for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
336       E = BaseRegs.end(); I != E; ++I)
337    if (RegUses.isRegUsedByUsesOtherThan(*I, LUIdx))
338      return true;
339  return false;
340}
341
342void Formula::print(raw_ostream &OS) const {
343  bool First = true;
344  if (AM.BaseGV) {
345    if (!First) OS << " + "; else First = false;
346    WriteAsOperand(OS, AM.BaseGV, /*PrintType=*/false);
347  }
348  if (AM.BaseOffs != 0) {
349    if (!First) OS << " + "; else First = false;
350    OS << AM.BaseOffs;
351  }
352  for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
353       E = BaseRegs.end(); I != E; ++I) {
354    if (!First) OS << " + "; else First = false;
355    OS << "reg(" << **I << ')';
356  }
357  if (AM.HasBaseReg && BaseRegs.empty()) {
358    if (!First) OS << " + "; else First = false;
359    OS << "**error: HasBaseReg**";
360  } else if (!AM.HasBaseReg && !BaseRegs.empty()) {
361    if (!First) OS << " + "; else First = false;
362    OS << "**error: !HasBaseReg**";
363  }
364  if (AM.Scale != 0) {
365    if (!First) OS << " + "; else First = false;
366    OS << AM.Scale << "*reg(";
367    if (ScaledReg)
368      OS << *ScaledReg;
369    else
370      OS << "<unknown>";
371    OS << ')';
372  }
373}
374
375void Formula::dump() const {
376  print(errs()); errs() << '\n';
377}
378
379/// isAddRecSExtable - Return true if the given addrec can be sign-extended
380/// without changing its value.
381static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
382  const Type *WideTy =
383    IntegerType::get(SE.getContext(),
384                     SE.getTypeSizeInBits(AR->getType()) + 1);
385  return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
386}
387
388/// isAddSExtable - Return true if the given add can be sign-extended
389/// without changing its value.
390static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
391  const Type *WideTy =
392    IntegerType::get(SE.getContext(),
393                     SE.getTypeSizeInBits(A->getType()) + 1);
394  return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
395}
396
397/// isMulSExtable - Return true if the given mul can be sign-extended
398/// without changing its value.
399static bool isMulSExtable(const SCEVMulExpr *A, ScalarEvolution &SE) {
400  const Type *WideTy =
401    IntegerType::get(SE.getContext(),
402                     SE.getTypeSizeInBits(A->getType()) + 1);
403  return isa<SCEVMulExpr>(SE.getSignExtendExpr(A, WideTy));
404}
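// The three helpers above share one trick: sign-extend the expression to a
// type one bit wider and check whether ScalarEvolution still produces the
// same kind of expression. If the extension folds into the operands (for
// example sext({A,+,S}) becoming {sext A,+,sext S}), the arithmetic cannot
// overflow in the original width. A sketch of the common pattern, using a
// hypothetical templated helper name:
//
//   template <typename ExprTy>
//   static bool isSExtableExpr(const ExprTy *E, ScalarEvolution &SE) {
//     const Type *WideTy =
//       IntegerType::get(SE.getContext(),
//                        SE.getTypeSizeInBits(E->getType()) + 1);
//     return isa<ExprTy>(SE.getSignExtendExpr(E, WideTy));
//   }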
405
406/// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined
407/// and if the remainder is known to be zero, or null otherwise. If
408/// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified
409/// to X, ignoring that the multiplication may overflow, which is useful when
410/// the result will be used in a context where the most significant bits are
411/// ignored.
412static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
413                                ScalarEvolution &SE,
414                                bool IgnoreSignificantBits = false) {
415  // Handle the trivial case, which works for any SCEV type.
416  if (LHS == RHS)
417    return SE.getConstant(LHS->getType(), 1);
418
419  // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do some
420  // folding.
421  if (RHS->isAllOnesValue())
422    return SE.getMulExpr(LHS, RHS);
423
424  // Check for a division of a constant by a constant.
425  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
426    const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
427    if (!RC)
428      return 0;
429    if (C->getValue()->getValue().srem(RC->getValue()->getValue()) != 0)
430      return 0;
431    return SE.getConstant(C->getValue()->getValue()
432               .sdiv(RC->getValue()->getValue()));
433  }
434
435  // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
436  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
437    if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
438      const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
439                                       IgnoreSignificantBits);
440      if (!Start) return 0;
441      const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
442                                      IgnoreSignificantBits);
443      if (!Step) return 0;
444      return SE.getAddRecExpr(Start, Step, AR->getLoop());
445    }
446  }
447
448  // Distribute the sdiv over add operands, if the add doesn't overflow.
449  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
450    if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
451      SmallVector<const SCEV *, 8> Ops;
452      for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
453           I != E; ++I) {
454        const SCEV *Op = getExactSDiv(*I, RHS, SE,
455                                      IgnoreSignificantBits);
456        if (!Op) return 0;
457        Ops.push_back(Op);
458      }
459      return SE.getAddExpr(Ops);
460    }
461  }
462
463  // Check for a multiply operand that we can pull RHS out of.
464  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS))
465    if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
466      SmallVector<const SCEV *, 4> Ops;
467      bool Found = false;
468      for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end();
469           I != E; ++I) {
470        const SCEV *S = *I;
471        if (!Found)
472          if (const SCEV *Q = getExactSDiv(S, RHS, SE,
473                                           IgnoreSignificantBits)) {
474            S = Q;
475            Found = true;
476          }
477        Ops.push_back(S);
478      }
479      return Found ? SE.getMulExpr(Ops) : 0;
480    }
481
482  // Otherwise we don't know.
483  return 0;
484}
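// Illustrative results (hypothetical SCEVs, assuming ScalarEvolution can
// prove the intermediate operations don't overflow):
//
//   getExactSDiv({0,+,4}<%L>, 4, SE)   ==> {0,+,1}<%L>
//   getExactSDiv((8 + 4 * %x), 4, SE)  ==> (2 + %x)
//   getExactSDiv((3 + %x), 4, SE)      ==> null (remainder not known zero)
//
// With IgnoreSignificantBits set, (%x * %y) /s %y simplifies to %x even
// though the multiply might overflow; callers only do that when the high
// bits of the result don't matter.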
485
486/// ExtractImmediate - If S involves the addition of a constant integer value,
487/// return that integer value, and mutate S to point to a new SCEV with that
488/// value excluded.
489static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
490  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
491    if (C->getValue()->getValue().getMinSignedBits() <= 64) {
492      S = SE.getConstant(C->getType(), 0);
493      return C->getValue()->getSExtValue();
494    }
495  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
496    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
497    int64_t Result = ExtractImmediate(NewOps.front(), SE);
498    S = SE.getAddExpr(NewOps);
499    return Result;
500  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
501    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
502    int64_t Result = ExtractImmediate(NewOps.front(), SE);
503    S = SE.getAddRecExpr(NewOps, AR->getLoop());
504    return Result;
505  }
506  return 0;
507}
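// For example (hypothetical SCEVs): if S is (5 + %x), ExtractImmediate
// returns 5 and rewrites S to %x; if S is {7,+,1}<%L>, it returns 7 and
// rewrites S to {0,+,1}<%L>; if S has no constant addend, it returns 0 and
// leaves S unchanged.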
508
509/// ExtractSymbol - If S involves the addition of a GlobalValue address,
510/// return that symbol, and mutate S to point to a new SCEV with that
511/// value excluded.
512static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
513  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
514    if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
515      S = SE.getConstant(GV->getType(), 0);
516      return GV;
517    }
518  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
519    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
520    GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
521    S = SE.getAddExpr(NewOps);
522    return Result;
523  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
524    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
525    GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
526    S = SE.getAddRecExpr(NewOps, AR->getLoop());
527    return Result;
528  }
529  return 0;
530}
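// For example (hypothetical SCEVs): if S is (@gv + %x), ExtractSymbol
// returns @gv and rewrites S to %x, relying on the symbolic operand sorting
// to the position the code inspects; if no GlobalValue addend is present,
// it returns null and leaves S unchanged.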
531
532/// isAddressUse - Returns true if the specified instruction is using the
533/// specified value as an address.
534static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
535  bool isAddress = isa<LoadInst>(Inst);
536  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
537    if (SI->getOperand(1) == OperandVal)
538      isAddress = true;
539  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
540    // Addressing modes can also be folded into prefetches and a variety
541    // of intrinsics.
542    switch (II->getIntrinsicID()) {
543      default: break;
544      case Intrinsic::prefetch:
545      case Intrinsic::x86_sse2_loadu_dq:
546      case Intrinsic::x86_sse2_loadu_pd:
547      case Intrinsic::x86_sse_loadu_ps:
548      case Intrinsic::x86_sse_storeu_ps:
549      case Intrinsic::x86_sse2_storeu_pd:
550      case Intrinsic::x86_sse2_storeu_dq:
551      case Intrinsic::x86_sse2_storel_dq:
552        if (II->getOperand(1) == OperandVal)
553          isAddress = true;
554        break;
555    }
556  }
557  return isAddress;
558}
559
560/// getAccessType - Return the type of the memory being accessed.
561static const Type *getAccessType(const Instruction *Inst) {
562  const Type *AccessTy = Inst->getType();
563  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
564    AccessTy = SI->getOperand(0)->getType();
565  else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
566    // Addressing modes can also be folded into prefetches and a variety
567    // of intrinsics.
568    switch (II->getIntrinsicID()) {
569    default: break;
570    case Intrinsic::x86_sse_storeu_ps:
571    case Intrinsic::x86_sse2_storeu_pd:
572    case Intrinsic::x86_sse2_storeu_dq:
573    case Intrinsic::x86_sse2_storel_dq:
574      AccessTy = II->getOperand(1)->getType();
575      break;
576    }
577  }
578
579  // All pointers have the same requirements, so canonicalize them to an
580  // arbitrary pointer type to minimize variation.
581  if (const PointerType *PTy = dyn_cast<PointerType>(AccessTy))
582    AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
583                                PTy->getAddressSpace());
584
585  return AccessTy;
586}
587
588/// DeleteTriviallyDeadInstructions - If any of the instructions in the
589/// specified set are trivially dead, delete them and see if this makes any of
590/// their operands subsequently dead.
591static bool
592DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
593  bool Changed = false;
594
595  while (!DeadInsts.empty()) {
596    Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
597
598    if (I == 0 || !isInstructionTriviallyDead(I))
599      continue;
600
601    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
602      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
603        *OI = 0;
604        if (U->use_empty())
605          DeadInsts.push_back(U);
606      }
607
608    I->eraseFromParent();
609    Changed = true;
610  }
611
612  return Changed;
613}
614
615namespace {
616
617/// Cost - This class is used to measure and compare candidate formulae.
618class Cost {
619  /// TODO: Some of these could be merged. Also, a lexical ordering
620  /// isn't always optimal.
621  unsigned NumRegs;
622  unsigned AddRecCost;
623  unsigned NumIVMuls;
624  unsigned NumBaseAdds;
625  unsigned ImmCost;
626  unsigned SetupCost;
627
628public:
629  Cost()
630    : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
631      SetupCost(0) {}
632
633  unsigned getNumRegs() const { return NumRegs; }
634
635  bool operator<(const Cost &Other) const;
636
637  void Loose();
638
639  void RateFormula(const Formula &F,
640                   SmallPtrSet<const SCEV *, 16> &Regs,
641                   const DenseSet<const SCEV *> &VisitedRegs,
642                   const Loop *L,
643                   const SmallVectorImpl<int64_t> &Offsets,
644                   ScalarEvolution &SE, DominatorTree &DT);
645
646  void print(raw_ostream &OS) const;
647  void dump() const;
648
649private:
650  void RateRegister(const SCEV *Reg,
651                    SmallPtrSet<const SCEV *, 16> &Regs,
652                    const Loop *L,
653                    ScalarEvolution &SE, DominatorTree &DT);
654  void RatePrimaryRegister(const SCEV *Reg,
655                           SmallPtrSet<const SCEV *, 16> &Regs,
656                           const Loop *L,
657                           ScalarEvolution &SE, DominatorTree &DT);
658};
659
660}
661
662/// RateRegister - Tally up interesting quantities from the given register.
663void Cost::RateRegister(const SCEV *Reg,
664                        SmallPtrSet<const SCEV *, 16> &Regs,
665                        const Loop *L,
666                        ScalarEvolution &SE, DominatorTree &DT) {
667  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
668    if (AR->getLoop() == L)
669      AddRecCost += 1; // TODO: This should be a function of the stride.
670
671    // If this is an addrec for a loop that's already been visited by LSR,
672    // don't second-guess its addrec phi nodes. LSR isn't currently smart
673    // enough to reason about more than one loop at a time. Consider these
674    // registers free and leave them alone.
675    else if (L->contains(AR->getLoop()) ||
676             (!AR->getLoop()->contains(L) &&
677              DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) {
678      for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
679           PHINode *PN = dyn_cast<PHINode>(I); ++I)
680        if (SE.isSCEVable(PN->getType()) &&
681            (SE.getEffectiveSCEVType(PN->getType()) ==
682             SE.getEffectiveSCEVType(AR->getType())) &&
683            SE.getSCEV(PN) == AR)
684          return;
685
686      // If this isn't one of the addrecs that the loop already has, it
687      // would require a costly new phi and add. TODO: This isn't
688      // precisely modeled right now.
689      ++NumBaseAdds;
690      if (!Regs.count(AR->getStart()))
691        RateRegister(AR->getStart(), Regs, L, SE, DT);
692    }
693
694    // Add the step value register, if it needs one.
695    // TODO: The non-affine case isn't precisely modeled here.
696    if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1)))
697      if (!Regs.count(AR->getOperand(1)))
698        RateRegister(AR->getOperand(1), Regs, L, SE, DT);
699  }
700  ++NumRegs;
701
702  // Rough heuristic; favor registers which don't require extra setup
703  // instructions in the preheader.
704  if (!isa<SCEVUnknown>(Reg) &&
705      !isa<SCEVConstant>(Reg) &&
706      !(isa<SCEVAddRecExpr>(Reg) &&
707        (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
708         isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
709    ++SetupCost;
710}
711
712/// RatePrimaryRegister - Record this register in the set. If we haven't seen it
713/// before, rate it.
714void Cost::RatePrimaryRegister(const SCEV *Reg,
715                               SmallPtrSet<const SCEV *, 16> &Regs,
716                               const Loop *L,
717                               ScalarEvolution &SE, DominatorTree &DT) {
718  if (Regs.insert(Reg))
719    RateRegister(Reg, Regs, L, SE, DT);
720}
721
722void Cost::RateFormula(const Formula &F,
723                       SmallPtrSet<const SCEV *, 16> &Regs,
724                       const DenseSet<const SCEV *> &VisitedRegs,
725                       const Loop *L,
726                       const SmallVectorImpl<int64_t> &Offsets,
727                       ScalarEvolution &SE, DominatorTree &DT) {
728  // Tally up the registers.
729  if (const SCEV *ScaledReg = F.ScaledReg) {
730    if (VisitedRegs.count(ScaledReg)) {
731      Loose();
732      return;
733    }
734    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT);
735  }
736  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
737       E = F.BaseRegs.end(); I != E; ++I) {
738    const SCEV *BaseReg = *I;
739    if (VisitedRegs.count(BaseReg)) {
740      Loose();
741      return;
742    }
743    RatePrimaryRegister(BaseReg, Regs, L, SE, DT);
744
745    NumIVMuls += isa<SCEVMulExpr>(BaseReg) &&
746                 BaseReg->hasComputableLoopEvolution(L);
747  }
748
749  if (F.BaseRegs.size() > 1)
750    NumBaseAdds += F.BaseRegs.size() - 1;
751
752  // Tally up the non-zero immediates.
753  for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
754       E = Offsets.end(); I != E; ++I) {
755    int64_t Offset = (uint64_t)*I + F.AM.BaseOffs;
756    if (F.AM.BaseGV)
757      ImmCost += 64; // Handle symbolic values conservatively.
758                     // TODO: This should probably be the pointer size.
759    else if (Offset != 0)
760      ImmCost += APInt(64, Offset, true).getMinSignedBits();
761  }
762}
763
764/// Loose - Set this cost to a losing value.
765void Cost::Loose() {
766  NumRegs = ~0u;
767  AddRecCost = ~0u;
768  NumIVMuls = ~0u;
769  NumBaseAdds = ~0u;
770  ImmCost = ~0u;
771  SetupCost = ~0u;
772}
773
774/// operator< - Choose the lower cost.
775bool Cost::operator<(const Cost &Other) const {
776  if (NumRegs != Other.NumRegs)
777    return NumRegs < Other.NumRegs;
778  if (AddRecCost != Other.AddRecCost)
779    return AddRecCost < Other.AddRecCost;
780  if (NumIVMuls != Other.NumIVMuls)
781    return NumIVMuls < Other.NumIVMuls;
782  if (NumBaseAdds != Other.NumBaseAdds)
783    return NumBaseAdds < Other.NumBaseAdds;
784  if (ImmCost != Other.ImmCost)
785    return ImmCost < Other.ImmCost;
786  if (SetupCost != Other.SetupCost)
787    return SetupCost < Other.SetupCost;
788  return false;
789}
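// The comparison above is a plain lexicographic order over the cost
// components; in C++11 terms (shown only as an equivalent illustration,
// not as code used here) it amounts to:
//
//   std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ImmCost, SetupCost)
//     < std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
//                Other.NumBaseAdds, Other.ImmCost, Other.SetupCost)
//
// which is why the TODO on the Cost class notes that a purely lexical
// ordering isn't always optimal.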
790
791void Cost::print(raw_ostream &OS) const {
792  OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
793  if (AddRecCost != 0)
794    OS << ", with addrec cost " << AddRecCost;
795  if (NumIVMuls != 0)
796    OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
797  if (NumBaseAdds != 0)
798    OS << ", plus " << NumBaseAdds << " base add"
799       << (NumBaseAdds == 1 ? "" : "s");
800  if (ImmCost != 0)
801    OS << ", plus " << ImmCost << " imm cost";
802  if (SetupCost != 0)
803    OS << ", plus " << SetupCost << " setup cost";
804}
805
806void Cost::dump() const {
807  print(errs()); errs() << '\n';
808}
809
810namespace {
811
812/// LSRFixup - An operand value in an instruction which is to be replaced
813/// with some equivalent, possibly strength-reduced, replacement.
814struct LSRFixup {
815  /// UserInst - The instruction which will be updated.
816  Instruction *UserInst;
817
818  /// OperandValToReplace - The operand of the instruction which will
819  /// be replaced. The operand may be used more than once; every instance
820  /// will be replaced.
821  Value *OperandValToReplace;
822
823  /// PostIncLoops - If this user is to use the post-incremented value of an
824  /// induction variable, this set holds the loops associated with the
825  /// induction variable.
826  PostIncLoopSet PostIncLoops;
827
828  /// LUIdx - The index of the LSRUse describing the expression which
829  /// this fixup needs, minus an offset (below).
830  size_t LUIdx;
831
832  /// Offset - A constant offset to be added to the LSRUse expression.
833  /// This allows multiple fixups to share the same LSRUse with different
834  /// offsets, for example in an unrolled loop.
835  int64_t Offset;
836
837  bool isUseFullyOutsideLoop(const Loop *L) const;
838
839  LSRFixup();
840
841  void print(raw_ostream &OS) const;
842  void dump() const;
843};
844
845}
846
847LSRFixup::LSRFixup()
848  : UserInst(0), OperandValToReplace(0),
849    LUIdx(~size_t(0)), Offset(0) {}
850
851/// isUseFullyOutsideLoop - Test whether this fixup always uses its
852/// value outside of the given loop.
853bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
854  // PHI nodes use their value in their incoming blocks.
855  if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
856    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
857      if (PN->getIncomingValue(i) == OperandValToReplace &&
858          L->contains(PN->getIncomingBlock(i)))
859        return false;
860    return true;
861  }
862
863  return !L->contains(UserInst);
864}
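// Example of the PHI special case (illustrative IR): for a loop-closed PHI
// in the exit block,
//
//   exit:
//     %lcssa = phi i32 [ %v, %latch ]
//
// the PHI effectively uses %v at the end of %latch. If %latch is inside the
// loop, the fixup is *not* fully outside the loop even though the PHI
// instruction itself is.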
865
866void LSRFixup::print(raw_ostream &OS) const {
867  OS << "UserInst=";
868  // Store is common and interesting enough to be worth special-casing.
869  if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
870    OS << "store ";
871    WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false);
872  } else if (UserInst->getType()->isVoidTy())
873    OS << UserInst->getOpcodeName();
874  else
875    WriteAsOperand(OS, UserInst, /*PrintType=*/false);
876
877  OS << ", OperandValToReplace=";
878  WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false);
879
880  for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(),
881       E = PostIncLoops.end(); I != E; ++I) {
882    OS << ", PostIncLoop=";
883    WriteAsOperand(OS, (*I)->getHeader(), /*PrintType=*/false);
884  }
885
886  if (LUIdx != ~size_t(0))
887    OS << ", LUIdx=" << LUIdx;
888
889  if (Offset != 0)
890    OS << ", Offset=" << Offset;
891}
892
893void LSRFixup::dump() const {
894  print(errs()); errs() << '\n';
895}
896
897namespace {
898
899/// UniquifierDenseMapInfo - A DenseMapInfo implementation for use in
900/// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*.
901struct UniquifierDenseMapInfo {
902  static SmallVector<const SCEV *, 2> getEmptyKey() {
903    SmallVector<const SCEV *, 2> V;
904    V.push_back(reinterpret_cast<const SCEV *>(-1));
905    return V;
906  }
907
908  static SmallVector<const SCEV *, 2> getTombstoneKey() {
909    SmallVector<const SCEV *, 2> V;
910    V.push_back(reinterpret_cast<const SCEV *>(-2));
911    return V;
912  }
913
914  static unsigned getHashValue(const SmallVector<const SCEV *, 2> &V) {
915    unsigned Result = 0;
916    for (SmallVectorImpl<const SCEV *>::const_iterator I = V.begin(),
917         E = V.end(); I != E; ++I)
918      Result ^= DenseMapInfo<const SCEV *>::getHashValue(*I);
919    return Result;
920  }
921
922  static bool isEqual(const SmallVector<const SCEV *, 2> &LHS,
923                      const SmallVector<const SCEV *, 2> &RHS) {
924    return LHS == RHS;
925  }
926};
927
928/// LSRUse - This class holds the state that LSR keeps for each use in
929/// IVUsers, as well as uses invented by LSR itself. It includes information
930/// about what kinds of things can be folded into the user, information about
931/// the user itself, and information about how the use may be satisfied.
932/// TODO: Represent multiple users of the same expression in common?
933class LSRUse {
934  DenseSet<SmallVector<const SCEV *, 2>, UniquifierDenseMapInfo> Uniquifier;
935
936public:
937  /// KindType - An enum for a kind of use, indicating what types of
938  /// scaled and immediate operands it might support.
939  enum KindType {
940    Basic,   ///< A normal use, with no folding.
941    Special, ///< A special case of basic, allowing -1 scales.
942    Address, ///< An address use; folding according to TargetLowering
943    ICmpZero ///< An equality icmp with both operands folded into one.
944    // TODO: Add a generic icmp too?
945  };
946
947  KindType Kind;
948  const Type *AccessTy;
949
950  SmallVector<int64_t, 8> Offsets;
951  int64_t MinOffset;
952  int64_t MaxOffset;
953
954  /// AllFixupsOutsideLoop - This records whether all of the fixups using this
955  /// LSRUse are outside of the loop, in which case some special-case heuristics
956  /// may be used.
957  bool AllFixupsOutsideLoop;
958
959  /// Formulae - A list of ways to build a value that can satisfy this user.
960  /// After the list is populated, one of these is selected heuristically and
961  /// used to formulate a replacement for OperandValToReplace in UserInst.
962  SmallVector<Formula, 12> Formulae;
963
964  /// Regs - The set of register candidates used by all formulae in this LSRUse.
965  SmallPtrSet<const SCEV *, 4> Regs;
966
967  LSRUse(KindType K, const Type *T) : Kind(K), AccessTy(T),
968                                      MinOffset(INT64_MAX),
969                                      MaxOffset(INT64_MIN),
970                                      AllFixupsOutsideLoop(true) {}
971
972  bool HasFormulaWithSameRegs(const Formula &F) const;
973  bool InsertFormula(const Formula &F);
974  void DeleteFormula(Formula &F);
975  void RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses);
976
977  void check() const;
978
979  void print(raw_ostream &OS) const;
980  void dump() const;
981};
982
983/// HasFormulaWithSameRegs - Test whether this use has a formula which has
984/// the same registers as the given formula.
985bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
986  SmallVector<const SCEV *, 2> Key = F.BaseRegs;
987  if (F.ScaledReg) Key.push_back(F.ScaledReg);
988  // Unstable sort by host order ok, because this is only used for uniquifying.
989  std::sort(Key.begin(), Key.end());
990  return Uniquifier.count(Key);
991}
992
993/// InsertFormula - If the given formula has not yet been inserted, add it to
994/// the list, and return true. Return false otherwise.
995bool LSRUse::InsertFormula(const Formula &F) {
996  SmallVector<const SCEV *, 2> Key = F.BaseRegs;
997  if (F.ScaledReg) Key.push_back(F.ScaledReg);
998  // Unstable sort by host order ok, because this is only used for uniquifying.
999  std::sort(Key.begin(), Key.end());
1000
1001  if (!Uniquifier.insert(Key).second)
1002    return false;
1003
1004  // Using a register to hold the value of 0 is not profitable.
1005  assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1006         "Zero allocated in a scaled register!");
1007#ifndef NDEBUG
1008  for (SmallVectorImpl<const SCEV *>::const_iterator I =
1009       F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I)
1010    assert(!(*I)->isZero() && "Zero allocated in a base register!");
1011#endif
1012
1013  // Add the formula to the list.
1014  Formulae.push_back(F);
1015
1016  // Record registers now being used by this use.
1017  if (F.ScaledReg) Regs.insert(F.ScaledReg);
1018  Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1019
1020  return true;
1021}
1022
1023/// DeleteFormula - Remove the given formula from this use's list.
1024void LSRUse::DeleteFormula(Formula &F) {
1025  if (&F != &Formulae.back())
1026    std::swap(F, Formulae.back());
1027  Formulae.pop_back();
1028  assert(!Formulae.empty() && "LSRUse has no formulae left!");
1029}
1030
1031/// RecomputeRegs - Recompute the Regs field, and update RegUses.
1032void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1033  // Now that we've filtered out some formulae, recompute the Regs set.
1034  SmallPtrSet<const SCEV *, 4> OldRegs = Regs;
1035  Regs.clear();
1036  for (size_t FIdx = 0, NumForms = Formulae.size(); FIdx != NumForms; ++FIdx) {
1037    Formula &F = Formulae[FIdx];
1038    if (F.ScaledReg) Regs.insert(F.ScaledReg);
1039    Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1040  }
1041
1042  // Update the RegTracker.
1043  for (SmallPtrSet<const SCEV *, 4>::iterator I = OldRegs.begin(),
1044       E = OldRegs.end(); I != E; ++I)
1045    if (!Regs.count(*I))
1046      RegUses.DropRegister(*I, LUIdx);
1047}
1048
1049void LSRUse::print(raw_ostream &OS) const {
1050  OS << "LSR Use: Kind=";
1051  switch (Kind) {
1052  case Basic:    OS << "Basic"; break;
1053  case Special:  OS << "Special"; break;
1054  case ICmpZero: OS << "ICmpZero"; break;
1055  case Address:
1056    OS << "Address of ";
1057    if (AccessTy->isPointerTy())
1058      OS << "pointer"; // the full pointer type could be really verbose
1059    else
1060      OS << *AccessTy;
1061  }
1062
1063  OS << ", Offsets={";
1064  for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
1065       E = Offsets.end(); I != E; ++I) {
1066    OS << *I;
1067    if (next(I) != E)
1068      OS << ',';
1069  }
1070  OS << '}';
1071
1072  if (AllFixupsOutsideLoop)
1073    OS << ", all-fixups-outside-loop";
1074}
1075
1076void LSRUse::dump() const {
1077  print(errs()); errs() << '\n';
1078}
1079
1080/// isLegalUse - Test whether the use described by AM is "legal", meaning it can
1081/// be completely folded into the user instruction at isel time. This includes
1082/// address-mode folding and special icmp tricks.
1083static bool isLegalUse(const TargetLowering::AddrMode &AM,
1084                       LSRUse::KindType Kind, const Type *AccessTy,
1085                       const TargetLowering *TLI) {
1086  switch (Kind) {
1087  case LSRUse::Address:
1088    // If we have low-level target information, ask the target if it can
1089    // completely fold this address.
1090    if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy);
1091
1092    // Otherwise, just guess that reg+reg addressing is legal.
1093    return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1;
1094
1095  case LSRUse::ICmpZero:
1096    // There's not even a target hook for querying whether it would be legal to
1097    // fold a GV into an ICmp.
1098    if (AM.BaseGV)
1099      return false;
1100
1101    // ICmp only has two operands; don't allow more than two non-trivial parts.
1102    if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0)
1103      return false;
1104
1105    // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1106    // putting the scaled register in the other operand of the icmp.
1107    if (AM.Scale != 0 && AM.Scale != -1)
1108      return false;
1109
1110    // If we have low-level target information, ask the target if it can fold an
1111    // integer immediate on an icmp.
1112    if (AM.BaseOffs != 0) {
1113      if (TLI) return TLI->isLegalICmpImmediate(-AM.BaseOffs);
1114      return false;
1115    }
1116
1117    return true;
1118
1119  case LSRUse::Basic:
1120    // Only handle single-register values.
1121    return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0;
1122
1123  case LSRUse::Special:
1124    // Only handle -1 scales, or no scale.
1125    return AM.Scale == 0 || AM.Scale == -1;
1126  }
1127
1128  return false;
1129}
1130
1131static bool isLegalUse(TargetLowering::AddrMode AM,
1132                       int64_t MinOffset, int64_t MaxOffset,
1133                       LSRUse::KindType Kind, const Type *AccessTy,
1134                       const TargetLowering *TLI) {
1135  // Check for overflow.
1136  if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) !=
1137      (MinOffset > 0))
1138    return false;
1139  AM.BaseOffs = (uint64_t)AM.BaseOffs + MinOffset;
1140  if (isLegalUse(AM, Kind, AccessTy, TLI)) {
1141    AM.BaseOffs = (uint64_t)AM.BaseOffs - MinOffset;
1142    // Check for overflow.
1143    if (((int64_t)((uint64_t)AM.BaseOffs + MaxOffset) > AM.BaseOffs) !=
1144        (MaxOffset > 0))
1145      return false;
1146    AM.BaseOffs = (uint64_t)AM.BaseOffs + MaxOffset;
1147    return isLegalUse(AM, Kind, AccessTy, TLI);
1148  }
1149  return false;
1150}
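// Illustrative example (target behavior is an assumption, not queried
// here): on a target like x86, an ordinary load can typically fold an
// address of the form "base + 4*index + offset", so an Address-kind use
// with AM.HasBaseReg, AM.Scale == 4, and a small AM.BaseOffs is accepted,
// while the same AddrMode for an ICmpZero use is rejected because an icmp
// can only "fold" a scale of 0 or -1. The offset-range overload above just
// re-checks legality with BaseOffs adjusted to both extremes, MinOffset and
// MaxOffset, after guarding against signed overflow in the addition.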
1151
1152static bool isAlwaysFoldable(int64_t BaseOffs,
1153                             GlobalValue *BaseGV,
1154                             bool HasBaseReg,
1155                             LSRUse::KindType Kind, const Type *AccessTy,
1156                             const TargetLowering *TLI) {
1157  // Fast-path: zero is always foldable.
1158  if (BaseOffs == 0 && !BaseGV) return true;
1159
1160  // Conservatively, create an address with an immediate and a
1161  // base and a scale.
1162  TargetLowering::AddrMode AM;
1163  AM.BaseOffs = BaseOffs;
1164  AM.BaseGV = BaseGV;
1165  AM.HasBaseReg = HasBaseReg;
1166  AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1167
1168  // Canonicalize a scale of 1 to a base register if the formula doesn't
1169  // already have a base register.
1170  if (!AM.HasBaseReg && AM.Scale == 1) {
1171    AM.Scale = 0;
1172    AM.HasBaseReg = true;
1173  }
1174
1175  return isLegalUse(AM, Kind, AccessTy, TLI);
1176}
1177
1178static bool isAlwaysFoldable(const SCEV *S,
1179                             int64_t MinOffset, int64_t MaxOffset,
1180                             bool HasBaseReg,
1181                             LSRUse::KindType Kind, const Type *AccessTy,
1182                             const TargetLowering *TLI,
1183                             ScalarEvolution &SE) {
1184  // Fast-path: zero is always foldable.
1185  if (S->isZero()) return true;
1186
1187  // Conservatively, create an address with an immediate and a
1188  // base and a scale.
1189  int64_t BaseOffs = ExtractImmediate(S, SE);
1190  GlobalValue *BaseGV = ExtractSymbol(S, SE);
1191
1192  // If there's anything else involved, it's not foldable.
1193  if (!S->isZero()) return false;
1194
1195  // Fast-path: zero is always foldable.
1196  if (BaseOffs == 0 && !BaseGV) return true;
1197
1198  // Conservatively, create an address with an immediate and a
1199  // base and a scale.
1200  TargetLowering::AddrMode AM;
1201  AM.BaseOffs = BaseOffs;
1202  AM.BaseGV = BaseGV;
1203  AM.HasBaseReg = HasBaseReg;
1204  AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1205
1206  return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI);
1207}
1208
1209/// FormulaSorter - This class implements an ordering for formulae which sorts
1210/// them by their standalone cost.
1211class FormulaSorter {
1212  /// These two sets are kept empty, so that we compute standalone costs.
1213  DenseSet<const SCEV *> VisitedRegs;
1214  SmallPtrSet<const SCEV *, 16> Regs;
1215  Loop *L;
1216  LSRUse *LU;
1217  ScalarEvolution &SE;
1218  DominatorTree &DT;
1219
1220public:
1221  FormulaSorter(Loop *l, LSRUse &lu, ScalarEvolution &se, DominatorTree &dt)
1222    : L(l), LU(&lu), SE(se), DT(dt) {}
1223
1224  bool operator()(const Formula &A, const Formula &B) {
1225    Cost CostA;
1226    CostA.RateFormula(A, Regs, VisitedRegs, L, LU->Offsets, SE, DT);
1227    Regs.clear();
1228    Cost CostB;
1229    CostB.RateFormula(B, Regs, VisitedRegs, L, LU->Offsets, SE, DT);
1230    Regs.clear();
1231    return CostA < CostB;
1232  }
1233};
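// Usage sketch (illustrative; exactly where the pass applies it is outside
// this excerpt): FormulaSorter is a comparison predicate over Formula, so a
// use's candidate formulae can be ordered by standalone cost with something
// like:
//
//   FormulaSorter Sorter(L, LU, SE, DT);
//   std::stable_sort(LU.Formulae.begin(), LU.Formulae.end(), Sorter);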
1234
1235/// LSRInstance - This class holds state for the main loop strength reduction
1236/// logic.
1237class LSRInstance {
1238  IVUsers &IU;
1239  ScalarEvolution &SE;
1240  DominatorTree &DT;
1241  LoopInfo &LI;
1242  const TargetLowering *const TLI;
1243  Loop *const L;
1244  bool Changed;
1245
1246  /// IVIncInsertPos - This is the position at which the current loop's
1247  /// induction variable increment should be inserted. In simple loops, this is
1248  /// the latch block's terminator. But in more complicated cases, this is a
1249  /// position which will dominate all the in-loop post-increment users.
1250  Instruction *IVIncInsertPos;
1251
1252  /// Factors - Interesting factors between use strides.
1253  SmallSetVector<int64_t, 8> Factors;
1254
1255  /// Types - Interesting use types, to facilitate truncation reuse.
1256  SmallSetVector<const Type *, 4> Types;
1257
1258  /// Fixups - The list of operands which are to be replaced.
1259  SmallVector<LSRFixup, 16> Fixups;
1260
1261  /// Uses - The list of interesting uses.
1262  SmallVector<LSRUse, 16> Uses;
1263
1264  /// RegUses - Track which uses use which register candidates.
1265  RegUseTracker RegUses;
1266
1267  void OptimizeShadowIV();
1268  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
1269  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
1270  bool OptimizeLoopTermCond();
1271
1272  void CollectInterestingTypesAndFactors();
1273  void CollectFixupsAndInitialFormulae();
1274
1275  LSRFixup &getNewFixup() {
1276    Fixups.push_back(LSRFixup());
1277    return Fixups.back();
1278  }
1279
1280  // Support for sharing of LSRUses between LSRFixups.
1281  typedef DenseMap<const SCEV *, size_t> UseMapTy;
1282  UseMapTy UseMap;
1283
1284  bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
1285                          bool HasBaseReg,
1286                          LSRUse::KindType Kind, const Type *AccessTy);
1287
1288  std::pair<size_t, int64_t> getUse(const SCEV *&Expr,
1289                                    LSRUse::KindType Kind,
1290                                    const Type *AccessTy);
1291
1292  void DeleteUse(LSRUse &LU);
1293
1294  LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
1295
1296public:
1297  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
1298  void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
1299  void CountRegisters(const Formula &F, size_t LUIdx);
1300  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
1301
1302  void CollectLoopInvariantFixupsAndFormulae();
1303
1304  void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
1305                              unsigned Depth = 0);
1306  void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
1307  void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
1308  void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
1309  void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
1310  void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
1311  void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
1312  void GenerateCrossUseConstantOffsets();
1313  void GenerateAllReuseFormulae();
1314
1315  void FilterOutUndesirableDedicatedRegisters();
1316
1317  size_t EstimateSearchSpaceComplexity() const;
1318  void NarrowSearchSpaceUsingHeuristics();
1319
1320  void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
1321                    Cost &SolutionCost,
1322                    SmallVectorImpl<const Formula *> &Workspace,
1323                    const Cost &CurCost,
1324                    const SmallPtrSet<const SCEV *, 16> &CurRegs,
1325                    DenseSet<const SCEV *> &VisitedRegs) const;
1326  void Solve(SmallVectorImpl<const Formula *> &Solution) const;
1327
1328  BasicBlock::iterator
1329    HoistInsertPosition(BasicBlock::iterator IP,
1330                        const SmallVectorImpl<Instruction *> &Inputs) const;
1331  BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
1332                                                     const LSRFixup &LF,
1333                                                     const LSRUse &LU) const;
1334
1335  Value *Expand(const LSRFixup &LF,
1336                const Formula &F,
1337                BasicBlock::iterator IP,
1338                SCEVExpander &Rewriter,
1339                SmallVectorImpl<WeakVH> &DeadInsts) const;
1340  void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
1341                     const Formula &F,
1342                     SCEVExpander &Rewriter,
1343                     SmallVectorImpl<WeakVH> &DeadInsts,
1344                     Pass *P) const;
1345  void Rewrite(const LSRFixup &LF,
1346               const Formula &F,
1347               SCEVExpander &Rewriter,
1348               SmallVectorImpl<WeakVH> &DeadInsts,
1349               Pass *P) const;
1350  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
1351                         Pass *P);
1352
1353  LSRInstance(const TargetLowering *tli, Loop *l, Pass *P);
1354
1355  bool getChanged() const { return Changed; }
1356
1357  void print_factors_and_types(raw_ostream &OS) const;
1358  void print_fixups(raw_ostream &OS) const;
1359  void print_uses(raw_ostream &OS) const;
1360  void print(raw_ostream &OS) const;
1361  void dump() const;
1362};
1363
1364}
1365
1366/// OptimizeShadowIV - If IV is used in an int-to-float cast
1367/// inside the loop then try to eliminate the cast operation.
1368void LSRInstance::OptimizeShadowIV() {
1369  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1370  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1371    return;
1372
1373  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
1374       UI != E; /* empty */) {
1375    IVUsers::const_iterator CandidateUI = UI;
1376    ++UI;
1377    Instruction *ShadowUse = CandidateUI->getUser();
1378    const Type *DestTy = NULL;
1379
1380    /* If shadow use is an int->float cast then insert a second IV
1381       to eliminate this cast.
1382
1383         for (unsigned i = 0; i < n; ++i)
1384           foo((double)i);
1385
1386       is transformed into
1387
1388         double d = 0.0;
1389         for (unsigned i = 0; i < n; ++i, ++d)
1390           foo(d);
1391    */
1392    if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser()))
1393      DestTy = UCast->getDestTy();
1394    else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser()))
1395      DestTy = SCast->getDestTy();
1396    if (!DestTy) continue;
1397
1398    if (TLI) {
1399      // If target does not support DestTy natively then do not apply
1400      // this transformation.
1401      EVT DVT = TLI->getValueType(DestTy);
1402      if (!TLI->isTypeLegal(DVT)) continue;
1403    }
1404
1405    PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
1406    if (!PH) continue;
1407    if (PH->getNumIncomingValues() != 2) continue;
1408
1409    const Type *SrcTy = PH->getType();
1410    int Mantissa = DestTy->getFPMantissaWidth();
1411    if (Mantissa == -1) continue;
1412    if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
1413      continue;
1414
1415    unsigned Entry, Latch;
1416    if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
1417      Entry = 0;
1418      Latch = 1;
1419    } else {
1420      Entry = 1;
1421      Latch = 0;
1422    }
1423
1424    ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
1425    if (!Init) continue;
1426    Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue());
1427
1428    BinaryOperator *Incr =
1429      dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
1430    if (!Incr) continue;
1431    if (Incr->getOpcode() != Instruction::Add
1432        && Incr->getOpcode() != Instruction::Sub)
1433      continue;
1434
1435    /* Initialize new IV, double d = 0.0 in above example. */
1436    ConstantInt *C = NULL;
1437    if (Incr->getOperand(0) == PH)
1438      C = dyn_cast<ConstantInt>(Incr->getOperand(1));
1439    else if (Incr->getOperand(1) == PH)
1440      C = dyn_cast<ConstantInt>(Incr->getOperand(0));
1441    else
1442      continue;
1443
1444    if (!C) continue;
1445
1446    // Ignore negative constants, as the code below doesn't handle them
1447    // correctly. TODO: Remove this restriction.
1448    if (!C->getValue().isStrictlyPositive()) continue;
1449
1450    /* Add new PHINode. */
1451    PHINode *NewPH = PHINode::Create(DestTy, "IV.S.", PH);
1452
1453    /* create new increment. '++d' in above example. */
1454    Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
1455    BinaryOperator *NewIncr =
1456      BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
1457                               Instruction::FAdd : Instruction::FSub,
1458                             NewPH, CFP, "IV.S.next.", Incr);
1459
1460    NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
1461    NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
1462
1463    /* Remove cast operation */
1464    ShadowUse->replaceAllUsesWith(NewPH);
1465    ShadowUse->eraseFromParent();
1466    break;
1467  }
1468}
1469
1470/// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
1471/// set the IV user and stride information and return true, otherwise return
1472/// false.
1473bool LSRInstance::FindIVUserForCond(ICmpInst *Cond,
1474                                    IVStrideUse *&CondUse) {
1475  for (IVUsers::iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
1476    if (UI->getUser() == Cond) {
1477      // NOTE: we could handle setcc instructions with multiple uses here, but
1478      // InstCombine does that for simple uses as well, and it's not clear that
1479      // this occurs often enough in real life to be worth handling.
1480      CondUse = UI;
1481      return true;
1482    }
1483  return false;
1484}
1485
1486/// OptimizeMax - Rewrite the loop's terminating condition if it uses
1487/// a max computation.
1488///
1489/// This is a narrow solution to a specific, but acute, problem. For loops
1490/// like this:
1491///
1492///   i = 0;
1493///   do {
1494///     p[i] = 0.0;
1495///   } while (++i < n);
1496///
1497/// the trip count isn't just 'n', because 'n' might not be positive. And
1498/// unfortunately this can come up even for loops where the user didn't use
1499/// a C do-while loop. For example, seemingly well-behaved top-test loops
1500/// will commonly be lowered like this:
1501///
1502///   if (n > 0) {
1503///     i = 0;
1504///     do {
1505///       p[i] = 0.0;
1506///     } while (++i < n);
1507///   }
1508///
1509/// and then it's possible for subsequent optimization to obscure the if
1510/// test in such a way that indvars can't find it.
1511///
1512/// When indvars can't find the if test in loops like this, it creates a
1513/// max expression, which allows it to give the loop a canonical
1514/// induction variable:
1515///
1516///   i = 0;
1517///   max = n < 1 ? 1 : n;
1518///   do {
1519///     p[i] = 0.0;
1520///   } while (++i != max);
1521///
1522/// Canonical induction variables are necessary because the loop passes
1523/// are designed around them. The most obvious example of this is the
1524/// LoopInfo analysis, which doesn't remember trip count values. It
1525/// expects to be able to rediscover the trip count each time it is
1526/// needed, and it does this using a simple analysis that only succeeds if
1527/// the loop has a canonical induction variable.
1528///
1529/// However, when it comes time to generate code, the maximum operation
1530/// can be quite costly, especially if it's inside of an outer loop.
1531///
1532/// This function solves this problem by detecting loops of this kind and
1533/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
1534/// the instructions for the maximum computation.
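///
/// Schematically (the value names below are made up for illustration), the
/// rewrite turns an exit test such as
///
///   %cmp = icmp slt i64 %n, 1
///   %max = select i1 %cmp, i64 1, i64 %n
///   ...
///   %cond = icmp ne i64 %i.next, %max
///
/// back into
///
///   %cond = icmp slt i64 %i.next, %n
///
/// after which the select (and its feeding icmp, if otherwise unused) can be
/// deleted.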
1535///
1536ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
1537  // Check that the loop matches the pattern we're looking for.
1538  if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
1539      Cond->getPredicate() != CmpInst::ICMP_NE)
1540    return Cond;
1541
1542  SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
1543  if (!Sel || !Sel->hasOneUse()) return Cond;
1544
1545  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1546  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1547    return Cond;
1548  const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
1549
1550  // Add one to the backedge-taken count to get the trip count.
1551  const SCEV *IterationCount = SE.getAddExpr(BackedgeTakenCount, One);
1552  if (IterationCount != SE.getSCEV(Sel)) return Cond;
1553
1554  // Check for a max calculation that matches the pattern. There's no check
1555  // for ICMP_ULE here because the comparison would be with zero, which
1556  // isn't interesting.
1557  CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
1558  const SCEVNAryExpr *Max = 0;
1559  if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
1560    Pred = ICmpInst::ICMP_SLE;
1561    Max = S;
1562  } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
1563    Pred = ICmpInst::ICMP_SLT;
1564    Max = S;
1565  } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
1566    Pred = ICmpInst::ICMP_ULT;
1567    Max = U;
1568  } else {
1569    // No match; bail.
1570    return Cond;
1571  }
1572
1573  // To handle a max with more than two operands, this optimization would
1574  // require additional checking and setup.
1575  if (Max->getNumOperands() != 2)
1576    return Cond;
1577
1578  const SCEV *MaxLHS = Max->getOperand(0);
1579  const SCEV *MaxRHS = Max->getOperand(1);
1580
1581  // ScalarEvolution canonicalizes constants to the left. For < and >, look
1582  // for a comparison with 1. For <= and >=, a comparison with zero.
1583  if (!MaxLHS ||
1584      (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
1585    return Cond;
1586
1587  // Check the relevant induction variable for conformance to
1588  // the pattern.
1589  const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
1590  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
1591  if (!AR || !AR->isAffine() ||
1592      AR->getStart() != One ||
1593      AR->getStepRecurrence(SE) != One)
1594    return Cond;
1595
1596  assert(AR->getLoop() == L &&
1597         "Loop condition operand is an addrec in a different loop!");
1598
1599  // Check the right operand of the select, and remember it, as it will
1600  // be used in the new comparison instruction.
1601  Value *NewRHS = 0;
1602  if (ICmpInst::isTrueWhenEqual(Pred)) {
1603    // Look for n+1, and grab n.
1604    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
1605      if (isa<ConstantInt>(BO->getOperand(1)) &&
1606          cast<ConstantInt>(BO->getOperand(1))->isOne() &&
1607          SE.getSCEV(BO->getOperand(0)) == MaxRHS)
1608        NewRHS = BO->getOperand(0);
1609    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
1610      if (isa<ConstantInt>(BO->getOperand(1)) &&
1611          cast<ConstantInt>(BO->getOperand(1))->isOne() &&
1612          SE.getSCEV(BO->getOperand(0)) == MaxRHS)
1613        NewRHS = BO->getOperand(0);
1614    if (!NewRHS)
1615      return Cond;
1616  } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
1617    NewRHS = Sel->getOperand(1);
1618  else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
1619    NewRHS = Sel->getOperand(2);
1620  else
1621    llvm_unreachable("Max doesn't match expected pattern!");
1622
1623  // Determine the new comparison opcode. It may be signed or unsigned,
1624  // and the original comparison may be either equality or inequality.
1625  if (Cond->getPredicate() == CmpInst::ICMP_EQ)
1626    Pred = CmpInst::getInversePredicate(Pred);
1627
1628  // Ok, everything looks ok to change the condition into an SLT or SGE and
1629  // delete the max calculation.
1630  ICmpInst *NewCond =
1631    new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
1632
1633  // Delete the max calculation instructions.
1634  Cond->replaceAllUsesWith(NewCond);
1635  CondUse->setUser(NewCond);
1636  Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
1637  Cond->eraseFromParent();
1638  Sel->eraseFromParent();
1639  if (Cmp->use_empty())
1640    Cmp->eraseFromParent();
1641  return NewCond;
1642}
1643
1644/// OptimizeLoopTermCond - Change loop terminating condition to use the
1645/// postinc iv when possible.
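///
/// Schematically (hand-written, simplified IR; not taken from a test case):
/// given a latch of the form
///
///   %i.next = add i64 %i, 1
///   %cond = icmp eq i64 %i, %n
///   br i1 %cond, label %exit, label %loop
///
/// the comparison is turned into a post-increment user of the IV, so the
/// later rewrite phase can express it in terms of %i.next (with the other
/// operand adjusted to compensate), letting the pre- and post-increment
/// values be coalesced into a single register.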
1646bool
1647LSRInstance::OptimizeLoopTermCond() {
1648  SmallPtrSet<Instruction *, 4> PostIncs;
1649
1650  BasicBlock *LatchBlock = L->getLoopLatch();
1651  SmallVector<BasicBlock*, 8> ExitingBlocks;
1652  L->getExitingBlocks(ExitingBlocks);
1653
1654  for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
1655    BasicBlock *ExitingBlock = ExitingBlocks[i];
1656
1657    // Get the terminating condition for the loop if possible.  If we
1658    // can, we want to change it to use a post-incremented version of its
1659    // induction variable, to allow coalescing the live ranges for the IV into
1660    // one register value.
1661
1662    BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
1663    if (!TermBr)
1664      continue;
1665    // FIXME: Overly conservative; the termination condition could be an 'or', etc.
1666    if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
1667      continue;
1668
1669    // Search IVUsesByStride to find Cond's IVUse if there is one.
1670    IVStrideUse *CondUse = 0;
1671    ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
1672    if (!FindIVUserForCond(Cond, CondUse))
1673      continue;
1674
1675    // If the trip count is computed in terms of a max (due to ScalarEvolution
1676    // being unable to find a sufficient guard, for example), change the loop
1677    // comparison to use SLT or ULT instead of NE.
1678    // One consequence of doing this now is that it disrupts the count-down
1679    // optimization. That's not always a bad thing though, because in such
1680    // cases it may still be worthwhile to avoid a max.
1681    Cond = OptimizeMax(Cond, CondUse);
1682
1683    // If this exiting block dominates the latch block, it may also use
1684    // the post-inc value if it won't be shared with other uses.
1685    // Check for dominance.
1686    if (!DT.dominates(ExitingBlock, LatchBlock))
1687      continue;
1688
1689    // Conservatively avoid trying to use the post-inc value in non-latch
1690    // exits if there may be pre-inc users in intervening blocks.
1691    if (LatchBlock != ExitingBlock)
1692      for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
1693        // Test if the use is reachable from the exiting block. This dominator
1694        // query is a conservative approximation of reachability.
1695        if (&*UI != CondUse &&
1696            !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
1697          // Conservatively assume there may be reuse if the quotient of their
1698          // strides could be a legal scale.
1699          const SCEV *A = IU.getStride(*CondUse, L);
1700          const SCEV *B = IU.getStride(*UI, L);
1701          if (!A || !B) continue;
1702          if (SE.getTypeSizeInBits(A->getType()) !=
1703              SE.getTypeSizeInBits(B->getType())) {
1704            if (SE.getTypeSizeInBits(A->getType()) >
1705                SE.getTypeSizeInBits(B->getType()))
1706              B = SE.getSignExtendExpr(B, A->getType());
1707            else
1708              A = SE.getSignExtendExpr(A, B->getType());
1709          }
1710          if (const SCEVConstant *D =
1711                dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
1712            // Stride of one or negative one can have reuse with non-addresses.
1713            if (D->getValue()->isOne() ||
1714                D->getValue()->isAllOnesValue())
1715              goto decline_post_inc;
1716            // Avoid weird situations.
1717            if (D->getValue()->getValue().getMinSignedBits() >= 64 ||
1718                D->getValue()->getValue().isMinSignedValue())
1719              goto decline_post_inc;
1720            // Without TLI, assume that any stride might be valid, and so any
1721            // use might be shared.
1722            if (!TLI)
1723              goto decline_post_inc;
1724            // Check for possible scaled-address reuse.
1725            const Type *AccessTy = getAccessType(UI->getUser());
1726            TargetLowering::AddrMode AM;
1727            AM.Scale = D->getValue()->getSExtValue();
1728            if (TLI->isLegalAddressingMode(AM, AccessTy))
1729              goto decline_post_inc;
1730            AM.Scale = -AM.Scale;
1731            if (TLI->isLegalAddressingMode(AM, AccessTy))
1732              goto decline_post_inc;
1733          }
1734        }
1735
1736    DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
1737                 << *Cond << '\n');
1738
1739    // It's possible for the setcc instruction to be anywhere in the loop, and
1740    // possible for it to have multiple users.  If it is not immediately before
1741    // the exiting block branch, move it.
1742    if (&*++BasicBlock::iterator(Cond) != TermBr) {
1743      if (Cond->hasOneUse()) {
1744        Cond->moveBefore(TermBr);
1745      } else {
1746        // Clone the terminating condition and insert it into the exiting block.
1747        ICmpInst *OldCond = Cond;
1748        Cond = cast<ICmpInst>(Cond->clone());
1749        Cond->setName(L->getHeader()->getName() + ".termcond");
1750        ExitingBlock->getInstList().insert(TermBr, Cond);
1751
1752        // Clone the IVUse, as the old use still exists!
1753        CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
1754        TermBr->replaceUsesOfWith(OldCond, Cond);
1755      }
1756    }
1757
1758    // If we get to here, we know that we can transform the setcc instruction to
1759    // use the post-incremented version of the IV, allowing us to coalesce the
1760    // live ranges for the IV correctly.
1761    CondUse->transformToPostInc(L);
1762    Changed = true;
1763
1764    PostIncs.insert(Cond);
1765  decline_post_inc:;
1766  }
1767
1768  // Determine an insertion point for the loop induction variable increment. It
1769  // must dominate all the post-inc comparisons we just set up, and it must
1770  // dominate the loop latch edge.
1771  IVIncInsertPos = L->getLoopLatch()->getTerminator();
1772  for (SmallPtrSet<Instruction *, 4>::const_iterator I = PostIncs.begin(),
1773       E = PostIncs.end(); I != E; ++I) {
1774    BasicBlock *BB =
1775      DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
1776                                    (*I)->getParent());
1777    if (BB == (*I)->getParent())
1778      IVIncInsertPos = *I;
1779    else if (BB != IVIncInsertPos->getParent())
1780      IVIncInsertPos = BB->getTerminator();
1781  }
1782
1783  return Changed;
1784}
1785
1786bool
1787LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
1788                                bool HasBaseReg,
1789                                LSRUse::KindType Kind, const Type *AccessTy) {
1790  int64_t NewMinOffset = LU.MinOffset;
1791  int64_t NewMaxOffset = LU.MaxOffset;
1792  const Type *NewAccessTy = AccessTy;
1793
1794  // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
1795  // something conservative, however this can pessimize in the case that one of
1796  // the uses will have all its uses outside the loop, for example.
1797  if (LU.Kind != Kind)
1798    return false;
1799  // Conservatively assume HasBaseReg is true for now.
1800  if (NewOffset < LU.MinOffset) {
1801    if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, HasBaseReg,
1802                          Kind, AccessTy, TLI))
1803      return false;
1804    NewMinOffset = NewOffset;
1805  } else if (NewOffset > LU.MaxOffset) {
1806    if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, HasBaseReg,
1807                          Kind, AccessTy, TLI))
1808      return false;
1809    NewMaxOffset = NewOffset;
1810  }
1811  // Check for a mismatched access type, and fall back conservatively as needed.
1812  if (Kind == LSRUse::Address && AccessTy != LU.AccessTy)
1813    NewAccessTy = Type::getVoidTy(AccessTy->getContext());
1814
1815  // Update the use.
1816  LU.MinOffset = NewMinOffset;
1817  LU.MaxOffset = NewMaxOffset;
1818  LU.AccessTy = NewAccessTy;
1819  if (NewOffset != LU.Offsets.back())
1820    LU.Offsets.push_back(NewOffset);
1821  return true;
1822}
1823
1824/// getUse - Return an LSRUse index and an offset value for a fixup which
1825/// needs the given expression, with the given kind and optional access type.
1826/// Either reuse an existing use or create a new one, as needed.
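///
/// Roughly, for example: a fixup whose expression is {16,+,4}<%loop> (SCEV
/// notation; hypothetical values) has the immediate 16 stripped off by
/// ExtractImmediate when it is foldable, so the UseMap key is the base
/// {0,+,4}<%loop> and the returned offset is 16, allowing other fixups on the
/// same base to share one LSRUse.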
1827std::pair<size_t, int64_t>
1828LSRInstance::getUse(const SCEV *&Expr,
1829                    LSRUse::KindType Kind, const Type *AccessTy) {
1830  const SCEV *Copy = Expr;
1831  int64_t Offset = ExtractImmediate(Expr, SE);
1832
1833  // Basic uses can't accept any offset, for example.
1834  if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) {
1835    Expr = Copy;
1836    Offset = 0;
1837  }
1838
1839  std::pair<UseMapTy::iterator, bool> P =
1840    UseMap.insert(std::make_pair(Expr, 0));
1841  if (!P.second) {
1842    // A use already existed with this base.
1843    size_t LUIdx = P.first->second;
1844    LSRUse &LU = Uses[LUIdx];
1845    if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
1846      // Reuse this use.
1847      return std::make_pair(LUIdx, Offset);
1848  }
1849
1850  // Create a new use.
1851  size_t LUIdx = Uses.size();
1852  P.first->second = LUIdx;
1853  Uses.push_back(LSRUse(Kind, AccessTy));
1854  LSRUse &LU = Uses[LUIdx];
1855
1856  // We don't need to track redundant offsets, but we don't need to go out
1857  // of our way here to avoid them.
1858  if (LU.Offsets.empty() || Offset != LU.Offsets.back())
1859    LU.Offsets.push_back(Offset);
1860
1861  LU.MinOffset = Offset;
1862  LU.MaxOffset = Offset;
1863  return std::make_pair(LUIdx, Offset);
1864}
1865
1866/// DeleteUse - Delete the given use from the Uses list.
1867void LSRInstance::DeleteUse(LSRUse &LU) {
1868  if (&LU != &Uses.back())
1869    std::swap(LU, Uses.back());
1870  Uses.pop_back();
1871}
1872
1873/// FindUseWithSimilarFormula - Look for a use distinct from OrigLU which has
1874/// a formula with the same registers as the given formula.
1875LSRUse *
1876LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
1877                                       const LSRUse &OrigLU) {
1878  // Search all uses for the formula. This could be more clever. Ignore
1879  // ICmpZero uses because they may contain formulae generated by
1880  // GenerateICmpZeroScales, in which case adding fixup offsets may
1881  // be invalid.
1882  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
1883    LSRUse &LU = Uses[LUIdx];
1884    if (&LU != &OrigLU &&
1885        LU.Kind != LSRUse::ICmpZero &&
1886        LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
1887        LU.HasFormulaWithSameRegs(OrigF)) {
1888      for (size_t FIdx = 0, NumForms = LU.Formulae.size();
1889           FIdx != NumForms; ++FIdx) {
1890        Formula &F = LU.Formulae[FIdx];
1891        if (F.BaseRegs == OrigF.BaseRegs &&
1892            F.ScaledReg == OrigF.ScaledReg &&
1893            F.AM.BaseGV == OrigF.AM.BaseGV &&
1894            F.AM.Scale == OrigF.AM.Scale) {
1896          if (F.AM.BaseOffs == 0)
1897            return &LU;
1898          break;
1899        }
1900      }
1901    }
1902  }
1903
1904  return 0;
1905}
1906
1907void LSRInstance::CollectInterestingTypesAndFactors() {
1908  SmallSetVector<const SCEV *, 4> Strides;
1909
1910  // Collect interesting types and strides.
1911  SmallVector<const SCEV *, 4> Worklist;
1912  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
1913    const SCEV *Expr = IU.getExpr(*UI);
1914
1915    // Collect interesting types.
1916    Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
1917
1918    // Add strides for mentioned loops.
1919    Worklist.push_back(Expr);
1920    do {
1921      const SCEV *S = Worklist.pop_back_val();
1922      if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1923        Strides.insert(AR->getStepRecurrence(SE));
1924        Worklist.push_back(AR->getStart());
1925      } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1926        Worklist.insert(Worklist.end(), Add->op_begin(), Add->op_end());
1927      }
1928    } while (!Worklist.empty());
1929  }
1930
1931  // Compute interesting factors from the set of interesting strides.
1932  for (SmallSetVector<const SCEV *, 4>::const_iterator
1933       I = Strides.begin(), E = Strides.end(); I != E; ++I)
1934    for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
1935         next(I); NewStrideIter != E; ++NewStrideIter) {
1936      const SCEV *OldStride = *I;
1937      const SCEV *NewStride = *NewStrideIter;
1938
1939      if (SE.getTypeSizeInBits(OldStride->getType()) !=
1940          SE.getTypeSizeInBits(NewStride->getType())) {
1941        if (SE.getTypeSizeInBits(OldStride->getType()) >
1942            SE.getTypeSizeInBits(NewStride->getType()))
1943          NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
1944        else
1945          OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
1946      }
1947      if (const SCEVConstant *Factor =
1948            dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
1949                                                        SE, true))) {
1950        if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
1951          Factors.insert(Factor->getValue()->getValue().getSExtValue());
1952      } else if (const SCEVConstant *Factor =
1953                   dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
1954                                                               NewStride,
1955                                                               SE, true))) {
1956        if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
1957          Factors.insert(Factor->getValue()->getValue().getSExtValue());
1958      }
1959    }
1960
1961  // If all uses use the same type, don't bother looking for truncation-based
1962  // reuse.
1963  if (Types.size() == 1)
1964    Types.clear();
1965
1966  DEBUG(print_factors_and_types(dbgs()));
1967}
1968
1969void LSRInstance::CollectFixupsAndInitialFormulae() {
1970  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
1971    // Record the uses.
1972    LSRFixup &LF = getNewFixup();
1973    LF.UserInst = UI->getUser();
1974    LF.OperandValToReplace = UI->getOperandValToReplace();
1975    LF.PostIncLoops = UI->getPostIncLoops();
1976
1977    LSRUse::KindType Kind = LSRUse::Basic;
1978    const Type *AccessTy = 0;
1979    if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
1980      Kind = LSRUse::Address;
1981      AccessTy = getAccessType(LF.UserInst);
1982    }
1983
1984    const SCEV *S = IU.getExpr(*UI);
1985
1986    // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
1987    // (N - i == 0), and this allows (N - i) to be the expression that we work
1988    // with rather than just N or i, so we can consider the register
1989    // requirements for both N and i at the same time. Limiting this code to
1990    // equality icmps is not a problem because all interesting loops use
1991    // equality icmps, thanks to IndVarSimplify.
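    // For example (schematically), if i has the SCEV {0,+,1}<%loop> and N is
    // loop-invariant, the test (i == N) is handled as the ICmpZero expression
    // (N - {0,+,1}<%loop>), i.e. {N,+,-1}<%loop>, so the solver weighs the
    // register requirements of N and i together.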
1992    if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst))
1993      if (CI->isEquality()) {
1994        // Swap the operands if needed to put the OperandValToReplace on the
1995        // left, for consistency.
1996        Value *NV = CI->getOperand(1);
1997        if (NV == LF.OperandValToReplace) {
1998          CI->setOperand(1, CI->getOperand(0));
1999          CI->setOperand(0, NV);
2000          NV = CI->getOperand(1);
2001          Changed = true;
2002        }
2003
2004        // x == y  -->  x - y == 0
2005        const SCEV *N = SE.getSCEV(NV);
2006        if (N->isLoopInvariant(L)) {
2007          Kind = LSRUse::ICmpZero;
2008          S = SE.getMinusSCEV(N, S);
2009        }
2010
2011        // -1 and the negations of all interesting strides (except the negation
2012        // of -1) are now also interesting.
2013        for (size_t i = 0, e = Factors.size(); i != e; ++i)
2014          if (Factors[i] != -1)
2015            Factors.insert(-(uint64_t)Factors[i]);
2016        Factors.insert(-1);
2017      }
2018
2019    // Set up the initial formula for this use.
2020    std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
2021    LF.LUIdx = P.first;
2022    LF.Offset = P.second;
2023    LSRUse &LU = Uses[LF.LUIdx];
2024    LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
2025
2026    // If this is the first use of this LSRUse, give it a formula.
2027    if (LU.Formulae.empty()) {
2028      InsertInitialFormula(S, LU, LF.LUIdx);
2029      CountRegisters(LU.Formulae.back(), LF.LUIdx);
2030    }
2031  }
2032
2033  DEBUG(print_fixups(dbgs()));
2034}
2035
2036void
2037LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
2038  Formula F;
2039  F.InitialMatch(S, L, SE, DT);
2040  bool Inserted = InsertFormula(LU, LUIdx, F);
2041  assert(Inserted && "Initial formula already exists!"); (void)Inserted;
2042}
2043
2044void
2045LSRInstance::InsertSupplementalFormula(const SCEV *S,
2046                                       LSRUse &LU, size_t LUIdx) {
2047  Formula F;
2048  F.BaseRegs.push_back(S);
2049  F.AM.HasBaseReg = true;
2050  bool Inserted = InsertFormula(LU, LUIdx, F);
2051  assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
2052}
2053
2054/// CountRegisters - Note which registers are used by the given formula,
2055/// updating RegUses.
2056void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
2057  if (F.ScaledReg)
2058    RegUses.CountRegister(F.ScaledReg, LUIdx);
2059  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
2060       E = F.BaseRegs.end(); I != E; ++I)
2061    RegUses.CountRegister(*I, LUIdx);
2062}
2063
2064/// InsertFormula - If the given formula has not yet been inserted, add it to
2065/// the list, and return true. Return false otherwise.
2066bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
2067  if (!LU.InsertFormula(F))
2068    return false;
2069
2070  CountRegisters(F, LUIdx);
2071  return true;
2072}
2073
2074/// CollectLoopInvariantFixupsAndFormulae - Check for other uses of
2075/// loop-invariant values which we're tracking. These other uses will pin these
2076/// values in registers, making them less profitable for elimination.
2077/// TODO: This currently misses non-constant addrec step registers.
2078/// TODO: Should this give more weight to users inside the loop?
2079void
2080LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
2081  SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
2082  SmallPtrSet<const SCEV *, 8> Inserted;
2083
2084  while (!Worklist.empty()) {
2085    const SCEV *S = Worklist.pop_back_val();
2086
2087    if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
2088      Worklist.insert(Worklist.end(), N->op_begin(), N->op_end());
2089    else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
2090      Worklist.push_back(C->getOperand());
2091    else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
2092      Worklist.push_back(D->getLHS());
2093      Worklist.push_back(D->getRHS());
2094    } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
2095      if (!Inserted.insert(U)) continue;
2096      const Value *V = U->getValue();
2097      if (const Instruction *Inst = dyn_cast<Instruction>(V))
2098        if (L->contains(Inst)) continue;
2099      for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
2100           UI != UE; ++UI) {
2101        const Instruction *UserInst = dyn_cast<Instruction>(*UI);
2102        // Ignore non-instructions.
2103        if (!UserInst)
2104          continue;
2105        // Ignore instructions in other functions (as can happen with
2106        // Constants).
2107        if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
2108          continue;
2109        // Ignore instructions not dominated by the loop.
2110        const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
2111          UserInst->getParent() :
2112          cast<PHINode>(UserInst)->getIncomingBlock(
2113            PHINode::getIncomingValueNumForOperand(UI.getOperandNo()));
2114        if (!DT.dominates(L->getHeader(), UseBB))
2115          continue;
2116        // Ignore uses which are part of other SCEV expressions, to avoid
2117        // analyzing them multiple times.
2118        if (SE.isSCEVable(UserInst->getType())) {
2119          const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
2120          // If the user is a no-op, look through to its uses.
2121          if (!isa<SCEVUnknown>(UserS))
2122            continue;
2123          if (UserS == U) {
2124            Worklist.push_back(
2125              SE.getUnknown(const_cast<Instruction *>(UserInst)));
2126            continue;
2127          }
2128        }
2129        // Ignore icmp instructions which are already being analyzed.
2130        if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
2131          unsigned OtherIdx = !UI.getOperandNo();
2132          Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
2133          if (SE.getSCEV(OtherOp)->hasComputableLoopEvolution(L))
2134            continue;
2135        }
2136
2137        LSRFixup &LF = getNewFixup();
2138        LF.UserInst = const_cast<Instruction *>(UserInst);
2139        LF.OperandValToReplace = UI.getUse();
2140        std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, 0);
2141        LF.LUIdx = P.first;
2142        LF.Offset = P.second;
2143        LSRUse &LU = Uses[LF.LUIdx];
2144        LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
2145        InsertSupplementalFormula(U, LU, LF.LUIdx);
2146        CountRegisters(LU.Formulae.back(), Uses.size() - 1);
2147        break;
2148      }
2149    }
2150  }
2151}
2152
2153/// CollectSubexprs - Split S into subexpressions which can be pulled out into
2154/// separate registers. If C is non-null, multiply each subexpression by C.
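///
/// As an illustrative sketch (SCEV notation; hypothetical values): splitting
///
///   {(16 + (4 * %a) + %b),+,8}<%loop>
///
/// yields the subexpressions {0,+,8}<%loop>, 16, (4 * %a), and %b, each of
/// which can then be considered as a separate base register.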
2155static void CollectSubexprs(const SCEV *S, const SCEVConstant *C,
2156                            SmallVectorImpl<const SCEV *> &Ops,
2157                            ScalarEvolution &SE) {
2158  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2159    // Break out add operands.
2160    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
2161         I != E; ++I)
2162      CollectSubexprs(*I, C, Ops, SE);
2163    return;
2164  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2165    // Split a non-zero base out of an addrec.
2166    if (!AR->getStart()->isZero()) {
2167      CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
2168                                       AR->getStepRecurrence(SE),
2169                                       AR->getLoop()), C, Ops, SE);
2170      CollectSubexprs(AR->getStart(), C, Ops, SE);
2171      return;
2172    }
2173  } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
2174    // Break (C * (a + b + c)) into C*a + C*b + C*c.
2175    if (Mul->getNumOperands() == 2)
2176      if (const SCEVConstant *Op0 =
2177            dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
2178        CollectSubexprs(Mul->getOperand(1),
2179                        C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0,
2180                        Ops, SE);
2181        return;
2182      }
2183  }
2184
2185  // Otherwise use the value itself.
2186  Ops.push_back(C ? SE.getMulExpr(C, S) : S);
2187}
2188
2189/// GenerateReassociations - Split out subexpressions from adds and the bases of
2190/// addrecs.
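///
/// For example (schematically), a formula whose only base register is
/// {(16 + %a),+,8}<%loop> can be re-expressed with the base registers
/// (16 + %a) and {0,+,8}<%loop> (among other splits), giving the solver more
/// combinations to cost against the target's addressing modes.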
2191void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
2192                                         Formula Base,
2193                                         unsigned Depth) {
2194  // Arbitrarily cap recursion to protect compile time.
2195  if (Depth >= 3) return;
2196
2197  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
2198    const SCEV *BaseReg = Base.BaseRegs[i];
2199
2200    SmallVector<const SCEV *, 8> AddOps;
2201    CollectSubexprs(BaseReg, 0, AddOps, SE);
2202    if (AddOps.size() == 1) continue;
2203
2204    for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
2205         JE = AddOps.end(); J != JE; ++J) {
2206      // Don't pull a constant into a register if the constant could be folded
2207      // into an immediate field.
2208      if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset,
2209                           Base.getNumRegs() > 1,
2210                           LU.Kind, LU.AccessTy, TLI, SE))
2211        continue;
2212
2213      // Collect all operands except *J.
2214      SmallVector<const SCEV *, 8> InnerAddOps;
2215      for (SmallVectorImpl<const SCEV *>::const_iterator K = AddOps.begin(),
2216           KE = AddOps.end(); K != KE; ++K)
2217        if (K != J)
2218          InnerAddOps.push_back(*K);
2219
2220      // Don't leave just a constant behind in a register if the constant could
2221      // be folded into an immediate field.
2222      if (InnerAddOps.size() == 1 &&
2223          isAlwaysFoldable(InnerAddOps[0], LU.MinOffset, LU.MaxOffset,
2224                           Base.getNumRegs() > 1,
2225                           LU.Kind, LU.AccessTy, TLI, SE))
2226        continue;
2227
2228      const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
2229      if (InnerSum->isZero())
2230        continue;
2231      Formula F = Base;
2232      F.BaseRegs[i] = InnerSum;
2233      F.BaseRegs.push_back(*J);
2234      if (InsertFormula(LU, LUIdx, F))
2235        // If that formula hadn't been seen before, recurse to find more like
2236        // it.
2237        GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1);
2238    }
2239  }
2240}
2241
2242/// GenerateCombinations - Generate a formula consisting of all of the
2243/// loop-dominating registers added into a single register.
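///
/// Schematically: if a formula's base registers include the loop-invariant
/// values %a and %b alongside an addrec, the invariant ones can be summed
/// into a single register (%a + %b), so the formula needs one register fewer.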
2244void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
2245                                       Formula Base) {
2246  // This method is only interesting on a plurality of registers.
2247  if (Base.BaseRegs.size() <= 1) return;
2248
2249  Formula F = Base;
2250  F.BaseRegs.clear();
2251  SmallVector<const SCEV *, 4> Ops;
2252  for (SmallVectorImpl<const SCEV *>::const_iterator
2253       I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) {
2254    const SCEV *BaseReg = *I;
2255    if (BaseReg->properlyDominates(L->getHeader(), &DT) &&
2256        !BaseReg->hasComputableLoopEvolution(L))
2257      Ops.push_back(BaseReg);
2258    else
2259      F.BaseRegs.push_back(BaseReg);
2260  }
2261  if (Ops.size() > 1) {
2262    const SCEV *Sum = SE.getAddExpr(Ops);
2263    // TODO: If Sum is zero, it probably means ScalarEvolution missed an
2264    // opportunity to fold something. For now, just ignore such cases
2265    // rather than proceed with zero in a register.
2266    if (!Sum->isZero()) {
2267      F.BaseRegs.push_back(Sum);
2268      (void)InsertFormula(LU, LUIdx, F);
2269    }
2270  }
2271}
2272
2273/// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets.
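///
/// Schematically (hypothetical names): a base register of the form
/// (@global + %i) can be rewritten so that @global becomes the formula's
/// AM.BaseGV and %i remains as the base register, when the target's
/// addressing modes accept a global as part of the address.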
2274void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
2275                                          Formula Base) {
2276  // We can't add a symbolic offset if the address already contains one.
2277  if (Base.AM.BaseGV) return;
2278
2279  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
2280    const SCEV *G = Base.BaseRegs[i];
2281    GlobalValue *GV = ExtractSymbol(G, SE);
2282    if (G->isZero() || !GV)
2283      continue;
2284    Formula F = Base;
2285    F.AM.BaseGV = GV;
2286    if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
2287                    LU.Kind, LU.AccessTy, TLI))
2288      continue;
2289    F.BaseRegs[i] = G;
2290    (void)InsertFormula(LU, LUIdx, F);
2291  }
2292}
2293
2294/// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
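///
/// Schematically (hypothetical values): for a use whose fixup offsets span
/// [0, 64], a base register {0,+,8}<%loop> can also be offered as
/// {64,+,8}<%loop> with -64 folded into the immediate field, so fixups at
/// either end of the range can share a register.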
2295void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
2296                                          Formula Base) {
2297  // TODO: For now, just add the min and max offset, because it usually isn't
2298  // worthwhile looking at everything in between.
2299  SmallVector<int64_t, 4> Worklist;
2300  Worklist.push_back(LU.MinOffset);
2301  if (LU.MaxOffset != LU.MinOffset)
2302    Worklist.push_back(LU.MaxOffset);
2303
2304  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
2305    const SCEV *G = Base.BaseRegs[i];
2306
2307    for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(),
2308         E = Worklist.end(); I != E; ++I) {
2309      Formula F = Base;
2310      F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I;
2311      if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I,
2312                     LU.Kind, LU.AccessTy, TLI)) {
2313        F.BaseRegs[i] = SE.getAddExpr(G, SE.getConstant(G->getType(), *I));
2314
2315        (void)InsertFormula(LU, LUIdx, F);
2316      }
2317    }
2318
2319    int64_t Imm = ExtractImmediate(G, SE);
2320    if (G->isZero() || Imm == 0)
2321      continue;
2322    Formula F = Base;
2323    F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Imm;
2324    if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
2325                    LU.Kind, LU.AccessTy, TLI))
2326      continue;
2327    F.BaseRegs[i] = G;
2328    (void)InsertFormula(LU, LUIdx, F);
2329  }
2330}
2331
2332/// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up
2333/// the comparison. For example, x == y -> x*c == y*c.
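///
/// Schematically (hypothetical values): an ICmpZero use over
/// ({0,+,1}<%loop> + -16) with an interesting factor of 4 can be rescaled to
/// ({0,+,4}<%loop> + -64), which may let the comparison share an existing
/// stride-4 register instead of requiring a separate stride-1 one.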
2334void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
2335                                         Formula Base) {
2336  if (LU.Kind != LSRUse::ICmpZero) return;
2337
2338  // Determine the integer type for the base formula.
2339  const Type *IntTy = Base.getType();
2340  if (!IntTy) return;
2341  if (SE.getTypeSizeInBits(IntTy) > 64) return;
2342
2343  // Don't do this if there is more than one offset.
2344  if (LU.MinOffset != LU.MaxOffset) return;
2345
2346  assert(!Base.AM.BaseGV && "ICmpZero use is not legal!");
2347
2348  // Check each interesting stride.
2349  for (SmallSetVector<int64_t, 8>::const_iterator
2350       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
2351    int64_t Factor = *I;
2352    Formula F = Base;
2353
2354    // Check that the multiplication doesn't overflow.
2355    if (F.AM.BaseOffs == INT64_MIN && Factor == -1)
2356      continue;
2357    F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs * Factor;
2358    if (F.AM.BaseOffs / Factor != Base.AM.BaseOffs)
2359      continue;
2360
2361    // Check that multiplying with the use offset doesn't overflow.
2362    int64_t Offset = LU.MinOffset;
2363    if (Offset == INT64_MIN && Factor == -1)
2364      continue;
2365    Offset = (uint64_t)Offset * Factor;
2366    if (Offset / Factor != LU.MinOffset)
2367      continue;
2368
2369    // Check that this scale is legal.
2370    if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI))
2371      continue;
2372
2373    // Compensate for the use having MinOffset built into it.
2374    F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Offset - LU.MinOffset;
2375
2376    const SCEV *FactorS = SE.getConstant(IntTy, Factor);
2377
2378    // Check that multiplying with each base register doesn't overflow.
2379    for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
2380      F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
2381      if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
2382        goto next;
2383    }
2384
2385    // Check that multiplying with the scaled register doesn't overflow.
2386    if (F.ScaledReg) {
2387      F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
2388      if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
2389        continue;
2390    }
2391
2392    // If we make it here and it's legal, add it.
2393    (void)InsertFormula(LU, LUIdx, F);
2394  next:;
2395  }
2396}
2397
2398/// GenerateScales - Generate stride factor reuse formulae by making use of
2399/// scaled-offset address modes, for example.
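///
/// Schematically (hypothetical values): with an interesting factor of 4, a
/// base register {0,+,4}<%loop> can be divided through to {0,+,1}<%loop> and
/// carried as the formula's ScaledReg with AM.Scale = 4, letting a
/// scaled-index addressing mode absorb the multiplication.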
2400void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx,
2401                                 Formula Base) {
2402  // Determine the integer type for the base formula.
2403  const Type *IntTy = Base.getType();
2404  if (!IntTy) return;
2405
2406  // If this Formula already has a scaled register, we can't add another one.
2407  if (Base.AM.Scale != 0) return;
2408
2409  // Check each interesting stride.
2410  for (SmallSetVector<int64_t, 8>::const_iterator
2411       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
2412    int64_t Factor = *I;
2413
2414    Base.AM.Scale = Factor;
2415    Base.AM.HasBaseReg = Base.BaseRegs.size() > 1;
2416    // Check whether this scale is going to be legal.
2417    if (!isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
2418                    LU.Kind, LU.AccessTy, TLI)) {
2419      // As a special case, treat out-of-loop Basic users as Special uses.
2420      // TODO: Reconsider this special case.
2421      if (LU.Kind == LSRUse::Basic &&
2422          isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
2423                     LSRUse::Special, LU.AccessTy, TLI) &&
2424          LU.AllFixupsOutsideLoop)
2425        LU.Kind = LSRUse::Special;
2426      else
2427        continue;
2428    }
2429    // For an ICmpZero, negating a solitary base register won't lead to
2430    // new solutions.
2431    if (LU.Kind == LSRUse::ICmpZero &&
2432        !Base.AM.HasBaseReg && Base.AM.BaseOffs == 0 && !Base.AM.BaseGV)
2433      continue;
2434    // For each addrec base reg, apply the scale, if possible.
2435    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
2436      if (const SCEVAddRecExpr *AR =
2437            dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
2438        const SCEV *FactorS = SE.getConstant(IntTy, Factor);
2439        if (FactorS->isZero())
2440          continue;
2441        // Divide out the factor, ignoring high bits, since we'll be
2442        // scaling the value back up in the end.
2443        if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
2444          // TODO: This could be optimized to avoid all the copying.
2445          Formula F = Base;
2446          F.ScaledReg = Quotient;
2447          F.DeleteBaseReg(F.BaseRegs[i]);
2448          (void)InsertFormula(LU, LUIdx, F);
2449        }
2450      }
2451  }
2452}
2453
2454/// GenerateTruncates - Generate reuse formulae from different IV types.
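///
/// Schematically: if the loop already has an i64 IV and this use's formula is
/// i32, and the target reports the i64-to-i32 truncate as free, the formula's
/// registers can be any-extended to i64 so the wider IV is reused rather than
/// keeping a second, narrower IV alive.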
2455void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx,
2456                                    Formula Base) {
2457  // This requires TargetLowering to tell us which truncates are free.
2458  if (!TLI) return;
2459
2460  // Don't bother truncating symbolic values.
2461  if (Base.AM.BaseGV) return;
2462
2463  // Determine the integer type for the base formula.
2464  const Type *DstTy = Base.getType();
2465  if (!DstTy) return;
2466  DstTy = SE.getEffectiveSCEVType(DstTy);
2467
2468  for (SmallSetVector<const Type *, 4>::const_iterator
2469       I = Types.begin(), E = Types.end(); I != E; ++I) {
2470    const Type *SrcTy = *I;
2471    if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) {
2472      Formula F = Base;
2473
2474      if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
2475      for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(),
2476           JE = F.BaseRegs.end(); J != JE; ++J)
2477        *J = SE.getAnyExtendExpr(*J, SrcTy);
2478
2479      // TODO: This assumes we've done basic processing on all uses and
2480      // have an idea what the register usage is.
2481      if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
2482        continue;
2483
2484      (void)InsertFormula(LU, LUIdx, F);
2485    }
2486  }
2487}
2488
2489namespace {
2490
2491/// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to
2492/// defer modifications so that the search phase doesn't have to worry about
2493/// the data structures moving underneath it.
2494struct WorkItem {
2495  size_t LUIdx;
2496  int64_t Imm;
2497  const SCEV *OrigReg;
2498
2499  WorkItem(size_t LI, int64_t I, const SCEV *R)
2500    : LUIdx(LI), Imm(I), OrigReg(R) {}
2501
2502  void print(raw_ostream &OS) const;
2503  void dump() const;
2504};
2505
2506}
2507
2508void WorkItem::print(raw_ostream &OS) const {
2509  OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
2510     << " , add offset " << Imm;
2511}
2512
2513void WorkItem::dump() const {
2514  print(errs()); errs() << '\n';
2515}
2516
2517/// GenerateCrossUseConstantOffsets - Look for registers which are a constant
2518/// distance apart and try to form reuse opportunities between them.
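///
/// Schematically (hypothetical values): if one use references the register
/// (%a) and another references (%a + 16), formulae for the latter can be
/// rewritten to use (%a) with 16 folded into their immediate fields, so both
/// uses can share the same register.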
2519void LSRInstance::GenerateCrossUseConstantOffsets() {
2520  // Group the registers by their value without any added constant offset.
2521  typedef std::map<int64_t, const SCEV *> ImmMapTy;
2522  typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy;
2523  RegMapTy Map;
2524  DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
2525  SmallVector<const SCEV *, 8> Sequence;
2526  for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
2527       I != E; ++I) {
2528    const SCEV *Reg = *I;
2529    int64_t Imm = ExtractImmediate(Reg, SE);
2530    std::pair<RegMapTy::iterator, bool> Pair =
2531      Map.insert(std::make_pair(Reg, ImmMapTy()));
2532    if (Pair.second)
2533      Sequence.push_back(Reg);
2534    Pair.first->second.insert(std::make_pair(Imm, *I));
2535    UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I);
2536  }
2537
2538  // Now examine each set of registers with the same base value. Build up
2539  // a list of work to do and do the work in a separate step so that we're
2540  // not adding formulae and register counts while we're searching.
2541  SmallVector<WorkItem, 32> WorkItems;
2542  SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
2543  for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
2544       E = Sequence.end(); I != E; ++I) {
2545    const SCEV *Reg = *I;
2546    const ImmMapTy &Imms = Map.find(Reg)->second;
2547
2548    // It's not worthwhile looking for reuse if there's only one offset.
2549    if (Imms.size() == 1)
2550      continue;
2551
2552    DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
2553          for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
2554               J != JE; ++J)
2555            dbgs() << ' ' << J->first;
2556          dbgs() << '\n');
2557
2558    // Examine each offset.
2559    for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
2560         J != JE; ++J) {
2561      const SCEV *OrigReg = J->second;
2562
2563      int64_t JImm = J->first;
2564      const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
2565
2566      if (!isa<SCEVConstant>(OrigReg) &&
2567          UsedByIndicesMap[Reg].count() == 1) {
2568        DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
2569        continue;
2570      }
2571
2572      // Conservatively examine offsets between this orig reg and a few selected
2573      // other orig regs.
2574      ImmMapTy::const_iterator OtherImms[] = {
2575        Imms.begin(), prior(Imms.end()),
2576        Imms.upper_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
2577      };
2578      for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
2579        ImmMapTy::const_iterator M = OtherImms[i];
2580        if (M == J || M == JE) continue;
2581
2582        // Compute the difference between the two.
2583        int64_t Imm = (uint64_t)JImm - M->first;
2584        for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
2585             LUIdx = UsedByIndices.find_next(LUIdx))
2586          // Make a memo of this use, offset, and register tuple.
2587          if (UniqueItems.insert(std::make_pair(LUIdx, Imm)))
2588            WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
2589      }
2590    }
2591  }
2592
2593  Map.clear();
2594  Sequence.clear();
2595  UsedByIndicesMap.clear();
2596  UniqueItems.clear();
2597
2598  // Now iterate through the worklist and add new formulae.
2599  for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(),
2600       E = WorkItems.end(); I != E; ++I) {
2601    const WorkItem &WI = *I;
2602    size_t LUIdx = WI.LUIdx;
2603    LSRUse &LU = Uses[LUIdx];
2604    int64_t Imm = WI.Imm;
2605    const SCEV *OrigReg = WI.OrigReg;
2606
2607    const Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
2608    const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
2609    unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
2610
2611    // TODO: Use a more targeted data structure.
2612    for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
2613      Formula F = LU.Formulae[L];
2614      // Use the immediate in the scaled register.
2615      if (F.ScaledReg == OrigReg) {
2616        int64_t Offs = (uint64_t)F.AM.BaseOffs +
2617                       Imm * (uint64_t)F.AM.Scale;
2618        // Don't create 50 + reg(-50).
2619        if (F.referencesReg(SE.getSCEV(
2620                   ConstantInt::get(IntTy, -(uint64_t)Offs))))
2621          continue;
2622        Formula NewF = F;
2623        NewF.AM.BaseOffs = Offs;
2624        if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
2625                        LU.Kind, LU.AccessTy, TLI))
2626          continue;
2627        NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
2628
2629        // If the new scale is a constant in a register, and adding the constant
2630        // value to the immediate would produce a value closer to zero than the
2631        // immediate itself, then the formula isn't worthwhile.
2632        if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
2633          if (C->getValue()->getValue().isNegative() !=
2634                (NewF.AM.BaseOffs < 0) &&
2635              (C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale))
2636                .ule(abs64(NewF.AM.BaseOffs)))
2637            continue;
2638
2639        // OK, looks good.
2640        (void)InsertFormula(LU, LUIdx, NewF);
2641      } else {
2642        // Use the immediate in a base register.
2643        for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
2644          const SCEV *BaseReg = F.BaseRegs[N];
2645          if (BaseReg != OrigReg)
2646            continue;
2647          Formula NewF = F;
2648          NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
2649          if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
2650                          LU.Kind, LU.AccessTy, TLI))
2651            continue;
2652          NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
2653
2654          // If the new formula has a constant in a register, and adding the
2655          // constant value to the immediate would produce a value closer to
2656          // zero than the immediate itself, then the formula isn't worthwhile.
2657          for (SmallVectorImpl<const SCEV *>::const_iterator
2658               J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end();
2659               J != JE; ++J)
2660            if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J))
2661              if ((C->getValue()->getValue() + NewF.AM.BaseOffs).abs().slt(
2662                   abs64(NewF.AM.BaseOffs)) &&
2663                  (C->getValue()->getValue() +
2664                   NewF.AM.BaseOffs).countTrailingZeros() >=
2665                   CountTrailingZeros_64(NewF.AM.BaseOffs))
2666                goto skip_formula;
2667
2668          // Ok, looks good.
2669          (void)InsertFormula(LU, LUIdx, NewF);
2670          break;
2671        skip_formula:;
2672        }
2673      }
2674    }
2675  }
2676}
2677
2678/// GenerateAllReuseFormulae - Generate formulae for each use.
2679void
2680LSRInstance::GenerateAllReuseFormulae() {
2681  // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
2682  // queries are more precise.
2683  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2684    LSRUse &LU = Uses[LUIdx];
2685    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2686      GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
2687    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2688      GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
2689  }
2690  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2691    LSRUse &LU = Uses[LUIdx];
2692    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2693      GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
2694    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2695      GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
2696    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2697      GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
2698    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2699      GenerateScales(LU, LUIdx, LU.Formulae[i]);
2700  }
2701  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2702    LSRUse &LU = Uses[LUIdx];
2703    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2704      GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
2705  }
2706
2707  GenerateCrossUseConstantOffsets();
2708}
2709
2710/// If there are multiple formulae with the same set of registers used
2711/// by other uses, pick the best one and delete the others.
2712void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
2713#ifndef NDEBUG
2714  bool Changed = false;
2715#endif
2716
2717  // Collect the best formula for each unique set of shared registers. This
2718  // is reset for each use.
2719  typedef DenseMap<SmallVector<const SCEV *, 2>, size_t, UniquifierDenseMapInfo>
2720    BestFormulaeTy;
2721  BestFormulaeTy BestFormulae;
2722
2723  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2724    LSRUse &LU = Uses[LUIdx];
2725    FormulaSorter Sorter(L, LU, SE, DT);
2726    DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << "\n");
2727
2728    bool Any = false;
2729    for (size_t FIdx = 0, NumForms = LU.Formulae.size();
2730         FIdx != NumForms; ++FIdx) {
2731      Formula &F = LU.Formulae[FIdx];
2732
2733      SmallVector<const SCEV *, 2> Key;
2734      for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(),
2735           JE = F.BaseRegs.end(); J != JE; ++J) {
2736        const SCEV *Reg = *J;
2737        if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
2738          Key.push_back(Reg);
2739      }
2740      if (F.ScaledReg &&
2741          RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
2742        Key.push_back(F.ScaledReg);
2743      // Unstable sort by host order ok, because this is only used for
2744      // uniquifying.
2745      std::sort(Key.begin(), Key.end());
2746
2747      std::pair<BestFormulaeTy::const_iterator, bool> P =
2748        BestFormulae.insert(std::make_pair(Key, FIdx));
2749      if (!P.second) {
2750        Formula &Best = LU.Formulae[P.first->second];
2751        if (Sorter.operator()(F, Best))
2752          std::swap(F, Best);
2753        DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
2754              dbgs() << "\n"
2755                        "    in favor of formula "; Best.print(dbgs());
2756              dbgs() << '\n');
2757#ifndef NDEBUG
2758        Changed = true;
2759#endif
2760        LU.DeleteFormula(F);
2761        --FIdx;
2762        --NumForms;
2763        Any = true;
2764        continue;
2765      }
2766    }
2767
2768    // Now that we've filtered out some formulae, recompute the Regs set.
2769    if (Any)
2770      LU.RecomputeRegs(LUIdx, RegUses);
2771
2772    // Reset this to prepare for the next use.
2773    BestFormulae.clear();
2774  }
2775
2776  DEBUG(if (Changed) {
2777          dbgs() << "\n"
2778                    "After filtering out undesirable candidates:\n";
2779          print_uses(dbgs());
2780        });
2781}
2782
2783// This is a rough guess that seems to work fairly well.
2784static const size_t ComplexityLimit = UINT16_MAX;
2785
2786/// EstimateSearchSpaceComplexity - Estimate the worst-case number of
2787/// solutions the solver might have to consider. It almost never considers
2788/// this many solutions because it prunes the search space, but the pruning
2789/// isn't always sufficient.
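///
/// For example, three uses with 10, 20, and 40 formulae respectively give a
/// worst-case estimate of 10 * 20 * 40 = 8000 candidate solutions, which is
/// still below the ComplexityLimit of UINT16_MAX (65535).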
2790size_t LSRInstance::EstimateSearchSpaceComplexity() const {
2791  uint32_t Power = 1;
2792  for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
2793       E = Uses.end(); I != E; ++I) {
2794    size_t FSize = I->Formulae.size();
2795    if (FSize >= ComplexityLimit) {
2796      Power = ComplexityLimit;
2797      break;
2798    }
2799    Power *= FSize;
2800    if (Power >= ComplexityLimit)
2801      break;
2802  }
2803  return Power;
2804}
2805
2806/// NarrowSearchSpaceUsingHeuristics - If there is an extraordinary number of
2807/// formulae to choose from, use some rough heuristics to prune down the number
2808/// of formulae. This keeps the main solver from taking an extraordinary amount
2809/// of time in some worst-case scenarios.
2810void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
2811  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
2812    DEBUG(dbgs() << "The search space is too complex.\n");
2813
2814    DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
2815                    "which use a superset of registers used by other "
2816                    "formulae.\n");
2817
2818    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2819      LSRUse &LU = Uses[LUIdx];
2820      bool Any = false;
2821      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
2822        Formula &F = LU.Formulae[i];
2823        for (SmallVectorImpl<const SCEV *>::const_iterator
2824             I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
2825          if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
2826            Formula NewF = F;
2827            NewF.AM.BaseOffs += C->getValue()->getSExtValue();
2828            NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
2829                                (I - F.BaseRegs.begin()));
2830            if (LU.HasFormulaWithSameRegs(NewF)) {
2831              DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
2832              LU.DeleteFormula(F);
2833              --i;
2834              --e;
2835              Any = true;
2836              break;
2837            }
2838          } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
2839            if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
2840              if (!F.AM.BaseGV) {
2841                Formula NewF = F;
2842                NewF.AM.BaseGV = GV;
2843                NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
2844                                    (I - F.BaseRegs.begin()));
2845                if (LU.HasFormulaWithSameRegs(NewF)) {
2846                  DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
2847                        dbgs() << '\n');
2848                  LU.DeleteFormula(F);
2849                  --i;
2850                  --e;
2851                  Any = true;
2852                  break;
2853                }
2854              }
2855          }
2856        }
2857      }
2858      if (Any)
2859        LU.RecomputeRegs(LUIdx, RegUses);
2860    }
2861
2862    DEBUG(dbgs() << "After pre-selection:\n";
2863          print_uses(dbgs()));
2864  }
2865
2866  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
2867    DEBUG(dbgs() << "The search space is too complex.\n");
2868
2869    DEBUG(dbgs() << "Narrowing the search space by assuming that uses "
2870                    "separated by a constant offset will use the same "
2871                    "registers.\n");
2872
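    // Illustrative example: if one use needs {reg + 4} and another needs
    // {reg}, the first use can be folded into the second by adding 4 to each
    // of its fixup offsets, so that both uses share the same registers.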
2873    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2874      LSRUse &LU = Uses[LUIdx];
2875      for (size_t FIdx = 0, NumForms = LU.Formulae.size();
2876           FIdx != NumForms; ++FIdx) {
2877        Formula &F = LU.Formulae[FIdx];
2878        if (F.AM.BaseOffs != 0 && F.AM.Scale == 0) {
2879          if (LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU)) {
2880            if (reconcileNewOffset(*LUThatHas, F.AM.BaseOffs,
2881                                   /*HasBaseReg=*/false,
2882                                   LU.Kind, LU.AccessTy)) {
2883              DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs());
2884                    dbgs() << '\n');
2885
2886              LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
2887
2888              // Delete formulae from the new use which are no longer legal.
2889              bool Any = false;
2890              for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
2891                Formula &F = LUThatHas->Formulae[i];
2892                if (!isLegalUse(F.AM,
2893                                LUThatHas->MinOffset, LUThatHas->MaxOffset,
2894                                LUThatHas->Kind, LUThatHas->AccessTy, TLI)) {
2895                  DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
2896                        dbgs() << '\n');
2897                  LUThatHas->DeleteFormula(F);
2898                  --i;
2899                  --e;
2900                  Any = true;
2901                }
2902              }
2903              if (Any)
2904                LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
2905
2906              // Update the relocs to reference the new use.
2907              for (size_t i = 0, e = Fixups.size(); i != e; ++i) {
2908                if (Fixups[i].LUIdx == LUIdx) {
2909                  Fixups[i].LUIdx = LUThatHas - &Uses.front();
2910                  Fixups[i].Offset += F.AM.BaseOffs;
2911                  DEBUG(errs() << "New fixup has offset "
2912                  DEBUG(dbgs() << "New fixup has offset "
2913                }
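                // Deleting LU below fills its slot with the use currently at
                // the end of the list, so redirect any fixup that referred to
                // that last use to LUIdx.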
2914                if (Fixups[i].LUIdx == NumUses-1)
2915                  Fixups[i].LUIdx = LUIdx;
2916              }
2917
2918              // Delete the old use.
2919              DeleteUse(LU);
2920              --LUIdx;
2921              --NumUses;
2922              break;
2923            }
2924          }
2925        }
2926      }
2927    }
2928
2929    DEBUG(dbgs() << "After pre-selection:\n";
2930          print_uses(dbgs()));
2931  }
2932
2933  SmallPtrSet<const SCEV *, 4> Taken;
2934  while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
2935    // Ok, we have too many formulae on our hands to conveniently handle.
2936    // Use a rough heuristic to thin out the list.
2937    DEBUG(dbgs() << "The search space is too complex.\n");
2938
2939    // Pick the register which is used by the most LSRUses, which is likely
2940    // to be a good reuse register candidate.
2941    const SCEV *Best = 0;
2942    unsigned BestNum = 0;
2943    for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
2944         I != E; ++I) {
2945      const SCEV *Reg = *I;
2946      if (Taken.count(Reg))
2947        continue;
2948      if (!Best) {
2949        Best = Reg; BestNum = RegUses.getUsedByIndices(Reg).count();
2950      } else {
2951        unsigned Count = RegUses.getUsedByIndices(Reg).count();
2952        if (Count > BestNum) {
2953          Best = Reg;
2954          BestNum = Count;
2955        }
2956      }
2957    }
2958
2959    DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
2960                 << " will yield profitable reuse.\n");
2961    Taken.insert(Best);
2962
2963    // In any use with formulae which reference this register, delete formulae
2964    // which don't reference it.
2965    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2966      LSRUse &LU = Uses[LUIdx];
2967      if (!LU.Regs.count(Best)) continue;
2968
2969      bool Any = false;
2970      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
2971        Formula &F = LU.Formulae[i];
2972        if (!F.referencesReg(Best)) {
2973          DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
2974          LU.DeleteFormula(F);
2975          --e;
2976          --i;
2977          Any = true;
2978          assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
2979          continue;
2980        }
2981      }
2982
2983      if (Any)
2984        LU.RecomputeRegs(LUIdx, RegUses);
2985    }
2986
2987    DEBUG(dbgs() << "After pre-selection:\n";
2988          print_uses(dbgs()));
2989  }
2990}
2991
2992/// SolveRecurse - This is the recursive solver.
2993void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2994                               Cost &SolutionCost,
2995                               SmallVectorImpl<const Formula *> &Workspace,
2996                               const Cost &CurCost,
2997                               const SmallPtrSet<const SCEV *, 16> &CurRegs,
2998                               DenseSet<const SCEV *> &VisitedRegs) const {
2999  // Some ideas:
3000  //  - prune more:
3001  //    - use more aggressive filtering
3002  //    - sort the formulae so that the most profitable solutions are found first
3003  //    - sort the uses too
3004  //  - search faster:
3005  //    - don't compute a cost, and then compare. compare while computing a cost
3006  //      and bail early.
3007  //    - track register sets with SmallBitVector
3008
3009  const LSRUse &LU = Uses[Workspace.size()];
3010
3011  // If this use references any register that's already a part of the
3012  // in-progress solution, consider it a requirement that a formula must
3013  // reference that register in order to be considered. This prunes out
3014  // unprofitable searching.
3015  SmallSetVector<const SCEV *, 4> ReqRegs;
3016  for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(),
3017       E = CurRegs.end(); I != E; ++I)
3018    if (LU.Regs.count(*I))
3019      ReqRegs.insert(*I);
3020
3021  bool AnySatisfiedReqRegs = false;
3022  SmallPtrSet<const SCEV *, 16> NewRegs;
3023  Cost NewCost;
3024retry:
3025  for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
3026       E = LU.Formulae.end(); I != E; ++I) {
3027    const Formula &F = *I;
3028
3029    // Ignore formulae which do not use any of the required registers.
3030    for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(),
3031         JE = ReqRegs.end(); J != JE; ++J) {
3032      const SCEV *Reg = *J;
3033      if ((!F.ScaledReg || F.ScaledReg != Reg) &&
3034          std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) ==
3035          F.BaseRegs.end())
3036        goto skip;
3037    }
3038    AnySatisfiedReqRegs = true;
3039
3040    // Evaluate the cost of the current formula. If it's already worse than
3041    // the current best, prune the search at that point.
3042    NewCost = CurCost;
3043    NewRegs = CurRegs;
3044    NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT);
3045    if (NewCost < SolutionCost) {
3046      Workspace.push_back(&F);
3047      if (Workspace.size() != Uses.size()) {
3048        SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
3049                     NewRegs, VisitedRegs);
3050        if (F.getNumRegs() == 1 && Workspace.size() == 1)
3051          VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
3052      } else {
3053        DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
3054              dbgs() << ". Regs:";
3055              for (SmallPtrSet<const SCEV *, 16>::const_iterator
3056                   I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I)
3057                dbgs() << ' ' << **I;
3058              dbgs() << '\n');
3059
3060        SolutionCost = NewCost;
3061        Solution = Workspace;
3062      }
3063      Workspace.pop_back();
3064    }
3065  skip:;
3066  }
3067
3068  // If none of the formulae had all of the required registers, relax the
3069  // constraint so that we don't exclude all formulae.
3070  if (!AnySatisfiedReqRegs) {
3071    assert(!ReqRegs.empty() && "Solver failed even without required registers");
3072    ReqRegs.clear();
3073    goto retry;
3074  }
3075}
3076
3077void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
3078  SmallVector<const Formula *, 8> Workspace;
3079  Cost SolutionCost;
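  // Start with a losing cost so that the first complete candidate solution
  // found by the recursion is accepted as the initial best.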
3080  SolutionCost.Loose();
3081  Cost CurCost;
3082  SmallPtrSet<const SCEV *, 16> CurRegs;
3083  DenseSet<const SCEV *> VisitedRegs;
3084  Workspace.reserve(Uses.size());
3085
3086  SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
3087               CurRegs, VisitedRegs);
3088
3089  // Ok, we've now made all our decisions.
3090  DEBUG(dbgs() << "\n"
3091                  "The chosen solution requires "; SolutionCost.print(dbgs());
3092        dbgs() << ":\n";
3093        for (size_t i = 0, e = Uses.size(); i != e; ++i) {
3094          dbgs() << "  ";
3095          Uses[i].print(dbgs());
3096          dbgs() << "\n"
3097                    "    ";
3098          Solution[i]->print(dbgs());
3099          dbgs() << '\n';
3100        });
3101}
3102
3103/// getImmediateDominator - A handy utility for the specific DominatorTree
3104/// query that we need here.
3105///
3106static BasicBlock *getImmediateDominator(BasicBlock *BB, DominatorTree &DT) {
3107  DomTreeNode *Node = DT.getNode(BB);
3108  if (!Node) return 0;
3109  Node = Node->getIDom();
3110  if (!Node) return 0;
3111  return Node->getBlock();
3112}
3113
3114/// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up
3115/// the dominator tree as far as we can go while still being dominated by the
3116/// input positions. This helps canonicalize the insert position, which
3117/// encourages sharing.
3118BasicBlock::iterator
3119LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
3120                                 const SmallVectorImpl<Instruction *> &Inputs)
3121                                                                         const {
3122  for (;;) {
3123    const Loop *IPLoop = LI.getLoopFor(IP->getParent());
3124    unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
3125
3126    BasicBlock *IDom;
3127    for (BasicBlock *Rung = IP->getParent(); ; Rung = IDom) {
3128      IDom = getImmediateDominator(Rung, DT);
3129      if (!IDom) return IP;
3130
3131      // Don't climb into a loop though.
3132      const Loop *IDomLoop = LI.getLoopFor(IDom);
3133      unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
3134      if (IDomDepth <= IPLoopDepth &&
3135          (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
3136        break;
3137    }
3138
3139    bool AllDominate = true;
3140    Instruction *BetterPos = 0;
3141    Instruction *Tentative = IDom->getTerminator();
3142    for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),
3143         E = Inputs.end(); I != E; ++I) {
3144      Instruction *Inst = *I;
3145      if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
3146        AllDominate = false;
3147        break;
3148      }
3149      // Attempt to find an insert position in the middle of the block,
3150      // instead of at the end, so that it can be used for other expansions.
3151      if (IDom == Inst->getParent() &&
3152          (!BetterPos || DT.dominates(BetterPos, Inst)))
3153        BetterPos = llvm::next(BasicBlock::iterator(Inst));
3154    }
3155    if (!AllDominate)
3156      break;
3157    if (BetterPos)
3158      IP = BetterPos;
3159    else
3160      IP = Tentative;
3161  }
3162
3163  return IP;
3164}
3165
3166/// AdjustInsertPositionForExpand - Determine an input position which will be
3167/// dominated by the operands and which will dominate the result.
3168BasicBlock::iterator
3169LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP,
3170                                           const LSRFixup &LF,
3171                                           const LSRUse &LU) const {
3172  // Collect some instructions which must be dominated by the
3173  // expanding replacement. These must be dominated by any operands that
3174  // will be required in the expansion.
3175  SmallVector<Instruction *, 4> Inputs;
3176  if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
3177    Inputs.push_back(I);
3178  if (LU.Kind == LSRUse::ICmpZero)
3179    if (Instruction *I =
3180          dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
3181      Inputs.push_back(I);
3182  if (LF.PostIncLoops.count(L)) {
3183    if (LF.isUseFullyOutsideLoop(L))
3184      Inputs.push_back(L->getLoopLatch()->getTerminator());
3185    else
3186      Inputs.push_back(IVIncInsertPos);
3187  }
3188  // The expansion must also be dominated by the increment positions of any
3189  // loops for which it is using post-inc mode.
3190  for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(),
3191       E = LF.PostIncLoops.end(); I != E; ++I) {
3192    const Loop *PIL = *I;
3193    if (PIL == L) continue;
3194
3195    // Be dominated by the loop exit.
3196    SmallVector<BasicBlock *, 4> ExitingBlocks;
3197    PIL->getExitingBlocks(ExitingBlocks);
3198    if (!ExitingBlocks.empty()) {
3199      BasicBlock *BB = ExitingBlocks[0];
3200      for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
3201        BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
3202      Inputs.push_back(BB->getTerminator());
3203    }
3204  }
3205
3206  // Then, climb up the immediate dominator tree as far as we can go while
3207  // still being dominated by the input positions.
3208  IP = HoistInsertPosition(IP, Inputs);
3209
3210  // Don't insert instructions before PHI nodes.
3211  while (isa<PHINode>(IP)) ++IP;
3212
3213  // Ignore debug intrinsics.
3214  while (isa<DbgInfoIntrinsic>(IP)) ++IP;
3215
3216  return IP;
3217}
3218
3219Value *LSRInstance::Expand(const LSRFixup &LF,
3220                           const Formula &F,
3221                           BasicBlock::iterator IP,
3222                           SCEVExpander &Rewriter,
3223                           SmallVectorImpl<WeakVH> &DeadInsts) const {
3224  const LSRUse &LU = Uses[LF.LUIdx];
3225
3226  // Determine an input position which will be dominated by the operands and
3227  // which will dominate the result.
3228  IP = AdjustInsertPositionForExpand(IP, LF, LU);
3229
3230  // Inform the Rewriter if we have a post-increment use, so that it can
3231  // perform an advantageous expansion.
3232  Rewriter.setPostInc(LF.PostIncLoops);
3233
3234  // This is the type that the user actually needs.
3235  const Type *OpTy = LF.OperandValToReplace->getType();
3236  // This will be the type that we'll initially expand to.
3237  const Type *Ty = F.getType();
3238  if (!Ty)
3239    // No type known; just expand directly to the ultimate type.
3240    Ty = OpTy;
3241  else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
3242    // Expand directly to the ultimate type if it's the right size.
3243    Ty = OpTy;
3244  // This is the type to do integer arithmetic in.
3245  const Type *IntTy = SE.getEffectiveSCEVType(Ty);
3246
3247  // Build up a list of operands to add together to form the full base.
3248  SmallVector<const SCEV *, 8> Ops;
3249
3250  // Expand the BaseRegs portion.
3251  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
3252       E = F.BaseRegs.end(); I != E; ++I) {
3253    const SCEV *Reg = *I;
3254    assert(!Reg->isZero() && "Zero allocated in a base register!");
3255
3256    // If we're expanding for a post-inc user, make the post-inc adjustment.
3257    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
3258    Reg = TransformForPostIncUse(Denormalize, Reg,
3259                                 LF.UserInst, LF.OperandValToReplace,
3260                                 Loops, SE, DT);
3261
3262    Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP)));
3263  }
3264
3265  // Flush the operand list to suppress SCEVExpander hoisting.
3266  if (!Ops.empty()) {
3267    Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
3268    Ops.clear();
3269    Ops.push_back(SE.getUnknown(FullV));
3270  }
3271
3272  // Expand the ScaledReg portion.
3273  Value *ICmpScaledV = 0;
3274  if (F.AM.Scale != 0) {
3275    const SCEV *ScaledS = F.ScaledReg;
3276
3277    // If we're expanding for a post-inc user, make the post-inc adjustment.
3278    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
3279    ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
3280                                     LF.UserInst, LF.OperandValToReplace,
3281                                     Loops, SE, DT);
3282
3283    if (LU.Kind == LSRUse::ICmpZero) {
3284      // An interesting way of "folding" with an icmp is to use a negated
3285      // scale, which we'll implement by inserting it into the other operand
3286      // of the icmp.
3287      assert(F.AM.Scale == -1 &&
3288             "The only scale supported by ICmpZero uses is -1!");
3289      ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP);
3290    } else {
3291      // Otherwise just expand the scaled register and an explicit scale,
3292      // which is expected to be matched as part of the address.
3293      ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));
3294      ScaledS = SE.getMulExpr(ScaledS,
3295                              SE.getConstant(ScaledS->getType(), F.AM.Scale));
3296      Ops.push_back(ScaledS);
3297
3298      // Flush the operand list to suppress SCEVExpander hoisting.
3299      Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
3300      Ops.clear();
3301      Ops.push_back(SE.getUnknown(FullV));
3302    }
3303  }
3304
3305  // Expand the GV portion.
3306  if (F.AM.BaseGV) {
3307    Ops.push_back(SE.getUnknown(F.AM.BaseGV));
3308
3309    // Flush the operand list to suppress SCEVExpander hoisting.
3310    Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
3311    Ops.clear();
3312    Ops.push_back(SE.getUnknown(FullV));
3313  }
3314
3315  // Expand the immediate portion.
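  // (The addition is performed in unsigned arithmetic so that wraparound is
  // well defined; the result is then interpreted as a signed offset.)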
3316  int64_t Offset = (uint64_t)F.AM.BaseOffs + LF.Offset;
3317  if (Offset != 0) {
3318    if (LU.Kind == LSRUse::ICmpZero) {
3319      // The other interesting way of "folding" with an ICmpZero is to use a
3320      // negated immediate.
3321      if (!ICmpScaledV)
3322        ICmpScaledV = ConstantInt::get(IntTy, -Offset);
3323      else {
3324        Ops.push_back(SE.getUnknown(ICmpScaledV));
3325        ICmpScaledV = ConstantInt::get(IntTy, Offset);
3326      }
3327    } else {
3328      // Just add the immediate values. These again are expected to be matched
3329      // as part of the address.
3330      Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
3331    }
3332  }
3333
3334  // Emit instructions summing all the operands.
3335  const SCEV *FullS = Ops.empty() ?
3336                      SE.getConstant(IntTy, 0) :
3337                      SE.getAddExpr(Ops);
3338  Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP);
3339
3340  // We're done expanding now, so reset the rewriter.
3341  Rewriter.clearPostInc();
3342
3343  // An ICmpZero Formula represents an ICmp which we're handling as a
3344  // comparison against zero. Now that we've expanded an expression for that
3345  // form, update the ICmp's other operand.
3346  if (LU.Kind == LSRUse::ICmpZero) {
3347    ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
3348    DeadInsts.push_back(CI->getOperand(1));
3349    assert(!F.AM.BaseGV && "ICmp does not support folding a global value and "
3350                           "a scale at the same time!");
3351    if (F.AM.Scale == -1) {
3352      if (ICmpScaledV->getType() != OpTy) {
3353        Instruction *Cast =
3354          CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
3355                                                   OpTy, false),
3356                           ICmpScaledV, OpTy, "tmp", CI);
3357        ICmpScaledV = Cast;
3358      }
3359      CI->setOperand(1, ICmpScaledV);
3360    } else {
3361      assert(F.AM.Scale == 0 &&
3362             "ICmp does not support folding a global value and "
3363             "a scale at the same time!");
3364      Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
3365                                           -(uint64_t)Offset);
3366      if (C->getType() != OpTy)
3367        C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
3368                                                          OpTy, false),
3369                                  C, OpTy);
3370
3371      CI->setOperand(1, C);
3372    }
3373  }
3374
3375  return FullV;
3376}
3377
3378/// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use
3379/// of their operands effectively happens in their predecessor blocks, so the
3380/// expression may need to be expanded in multiple places.
3381void LSRInstance::RewriteForPHI(PHINode *PN,
3382                                const LSRFixup &LF,
3383                                const Formula &F,
3384                                SCEVExpander &Rewriter,
3385                                SmallVectorImpl<WeakVH> &DeadInsts,
3386                                Pass *P) const {
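  // Cache the value expanded for each predecessor block, so that a PHI with
  // multiple incoming edges from the same block reuses a single expansion.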
3387  DenseMap<BasicBlock *, Value *> Inserted;
3388  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
3389    if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
3390      BasicBlock *BB = PN->getIncomingBlock(i);
3391
3392      // If this is a critical edge, split the edge so that we do not insert
3393      // the code on all predecessor/successor paths.  We do this unless this
3394      // is the canonical backedge for this loop, which complicates post-inc
3395      // users.
3396      if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
3397          !isa<IndirectBrInst>(BB->getTerminator()) &&
3398          (PN->getParent() != L->getHeader() || !L->contains(BB))) {
3399        // Split the critical edge.
3400        BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P);
3401
3402        // If PN is outside of the loop and BB is in the loop, we want to
3403        // move the block to be immediately before the PHI block, not
3404        // immediately after BB.
3405        if (L->contains(BB) && !L->contains(PN))
3406          NewBB->moveBefore(PN->getParent());
3407
3408        // Splitting the edge can reduce the number of PHI entries we have.
3409        e = PN->getNumIncomingValues();
3410        BB = NewBB;
3411        i = PN->getBasicBlockIndex(BB);
3412      }
3413
3414      std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
3415        Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
3416      if (!Pair.second)
3417        PN->setIncomingValue(i, Pair.first->second);
3418      else {
3419        Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts);
3420
3421        // If this is reuse-by-noop-cast, insert the noop cast.
3422        const Type *OpTy = LF.OperandValToReplace->getType();
3423        if (FullV->getType() != OpTy)
3424          FullV =
3425            CastInst::Create(CastInst::getCastOpcode(FullV, false,
3426                                                     OpTy, false),
3427                             FullV, LF.OperandValToReplace->getType(),
3428                             "tmp", BB->getTerminator());
3429
3430        PN->setIncomingValue(i, FullV);
3431        Pair.first->second = FullV;
3432      }
3433    }
3434}
3435
3436/// Rewrite - Emit instructions for the leading candidate expression for this
3437/// LSRUse (this is called "expanding"), and update the UserInst to reference
3438/// the newly expanded value.
3439void LSRInstance::Rewrite(const LSRFixup &LF,
3440                          const Formula &F,
3441                          SCEVExpander &Rewriter,
3442                          SmallVectorImpl<WeakVH> &DeadInsts,
3443                          Pass *P) const {
3444  // First, find an insertion point that dominates UserInst. For PHI nodes,
3445  // find the nearest block which dominates all the relevant uses.
3446  if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
3447    RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P);
3448  } else {
3449    Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts);
3450
3451    // If this is reuse-by-noop-cast, insert the noop cast.
3452    const Type *OpTy = LF.OperandValToReplace->getType();
3453    if (FullV->getType() != OpTy) {
3454      Instruction *Cast =
3455        CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
3456                         FullV, OpTy, "tmp", LF.UserInst);
3457      FullV = Cast;
3458    }
3459
3460    // Update the user. ICmpZero is handled specially here (for now) because
3461    // Expand may have updated one of the operands of the icmp already, and
3462    // its new value may happen to be equal to LF.OperandValToReplace, in
3463    // which case doing replaceUsesOfWith leads to replacing both operands
3464    // with the same value. TODO: Reorganize this.
3465    if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
3466      LF.UserInst->setOperand(0, FullV);
3467    else
3468      LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
3469  }
3470
3471  DeadInsts.push_back(LF.OperandValToReplace);
3472}
3473
3474void
3475LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
3476                               Pass *P) {
3477  // Keep track of instructions we may have made dead, so that
3478  // we can remove them after we are done working.
3479  SmallVector<WeakVH, 16> DeadInsts;
3480
3481  SCEVExpander Rewriter(SE);
3482  Rewriter.disableCanonicalMode();
3483  Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
3484
3485  // Expand the new value definitions and update the users.
3486  for (size_t i = 0, e = Fixups.size(); i != e; ++i) {
3487    size_t LUIdx = Fixups[i].LUIdx;
3488
3489    Rewrite(Fixups[i], *Solution[LUIdx], Rewriter, DeadInsts, P);
3490
3491    Changed = true;
3492  }
3493
3494  // Clean up after ourselves. This must be done before deleting any
3495  // instructions.
3496  Rewriter.clear();
3497
3498  Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
3499}
3500
3501LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
3502  : IU(P->getAnalysis<IVUsers>()),
3503    SE(P->getAnalysis<ScalarEvolution>()),
3504    DT(P->getAnalysis<DominatorTree>()),
3505    LI(P->getAnalysis<LoopInfo>()),
3506    TLI(tli), L(l), Changed(false), IVIncInsertPos(0) {
3507
3508  // If LoopSimplify form is not available, stay out of trouble.
3509  if (!L->isLoopSimplifyForm()) return;
3510
3511  // If there's no interesting work to be done, bail early.
3512  if (IU.empty()) return;
3513
3514  DEBUG(dbgs() << "\nLSR on loop ";
3515        WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false);
3516        dbgs() << ":\n");
3517
3518  /// OptimizeShadowIV - If IV is used in an int-to-float cast
3519  /// inside the loop then try to eliminate the cast operation.
3520  OptimizeShadowIV();
3521
3522  // Change loop terminating condition to use the postinc iv when possible.
3523  Changed |= OptimizeLoopTermCond();
3524
3525  CollectInterestingTypesAndFactors();
3526  CollectFixupsAndInitialFormulae();
3527  CollectLoopInvariantFixupsAndFormulae();
3528
3529  DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
3530        print_uses(dbgs()));
3531
3532  // Now use the reuse data to generate a bunch of interesting ways
3533  // to formulate the values needed for the uses.
3534  GenerateAllReuseFormulae();
3535
3536  DEBUG(dbgs() << "\n"
3537                  "After generating reuse formulae:\n";
3538        print_uses(dbgs()));
3539
3540  FilterOutUndesirableDedicatedRegisters();
3541  NarrowSearchSpaceUsingHeuristics();
3542
3543  SmallVector<const Formula *, 8> Solution;
3544  Solve(Solution);
3545  assert(Solution.size() == Uses.size() && "Malformed solution!");
3546
3547  // Release memory that is no longer needed.
3548  Factors.clear();
3549  Types.clear();
3550  RegUses.clear();
3551
3552#ifndef NDEBUG
3553  // Formulae should be legal.
3554  for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
3555       E = Uses.end(); I != E; ++I) {
3556     const LSRUse &LU = *I;
3557     for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
3558          JE = LU.Formulae.end(); J != JE; ++J)
3559        assert(isLegalUse(J->AM, LU.MinOffset, LU.MaxOffset,
3560                          LU.Kind, LU.AccessTy, TLI) &&
3561               "Illegal formula generated!");
3562  }
3563#endif
3564
3565  // Now that we've decided what we want, make it so.
3566  ImplementSolution(Solution, P);
3567}
3568
3569void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
3570  if (Factors.empty() && Types.empty()) return;
3571
3572  OS << "LSR has identified the following interesting factors and types: ";
3573  bool First = true;
3574
3575  for (SmallSetVector<int64_t, 8>::const_iterator
3576       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
3577    if (!First) OS << ", ";
3578    First = false;
3579    OS << '*' << *I;
3580  }
3581
3582  for (SmallSetVector<const Type *, 4>::const_iterator
3583       I = Types.begin(), E = Types.end(); I != E; ++I) {
3584    if (!First) OS << ", ";
3585    First = false;
3586    OS << '(' << **I << ')';
3587  }
3588  OS << '\n';
3589}
3590
3591void LSRInstance::print_fixups(raw_ostream &OS) const {
3592  OS << "LSR is examining the following fixup sites:\n";
3593  for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
3594       E = Fixups.end(); I != E; ++I) {
3595    const LSRFixup &LF = *I;
3596    OS << "  ";
3597    LF.print(OS);
3598    OS << '\n';
3599  }
3600}
3601
3602void LSRInstance::print_uses(raw_ostream &OS) const {
3603  OS << "LSR is examining the following uses:\n";
3604  for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
3605       E = Uses.end(); I != E; ++I) {
3606    const LSRUse &LU = *I;
3607    OS << "  ";
3608    LU.print(OS);
3609    OS << '\n';
3610    for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
3611         JE = LU.Formulae.end(); J != JE; ++J) {
3612      OS << "    ";
3613      J->print(OS);
3614      OS << '\n';
3615    }
3616  }
3617}
3618
3619void LSRInstance::print(raw_ostream &OS) const {
3620  print_factors_and_types(OS);
3621  print_fixups(OS);
3622  print_uses(OS);
3623}
3624
3625void LSRInstance::dump() const {
3626  print(errs()); errs() << '\n';
3627}
3628
3629namespace {
3630
3631class LoopStrengthReduce : public LoopPass {
3632  /// TLI - Keep a pointer to a TargetLowering to consult for determining
3633  /// transformation profitability.
3634  const TargetLowering *const TLI;
3635
3636public:
3637  static char ID; // Pass ID, replacement for typeid
3638  explicit LoopStrengthReduce(const TargetLowering *tli = 0);
3639
3640private:
3641  bool runOnLoop(Loop *L, LPPassManager &LPM);
3642  void getAnalysisUsage(AnalysisUsage &AU) const;
3643};
3644
3645}
3646
3647char LoopStrengthReduce::ID = 0;
3648static RegisterPass<LoopStrengthReduce>
3649X("loop-reduce", "Loop Strength Reduction");
3650
3651Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
3652  return new LoopStrengthReduce(TLI);
3653}
3654
3655LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli)
3656  : LoopPass(&ID), TLI(tli) {}
3657
3658void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
3659  // We split critical edges, so we change the CFG.  However, we do update
3660  // many analyses if they are around.
3661  AU.addPreservedID(LoopSimplifyID);
3662  AU.addPreserved("domfrontier");
3663
3664  AU.addRequired<LoopInfo>();
3665  AU.addPreserved<LoopInfo>();
3666  AU.addRequiredID(LoopSimplifyID);
3667  AU.addRequired<DominatorTree>();
3668  AU.addPreserved<DominatorTree>();
3669  AU.addRequired<ScalarEvolution>();
3670  AU.addPreserved<ScalarEvolution>();
3671  AU.addRequired<IVUsers>();
3672  AU.addPreserved<IVUsers>();
3673}
3674
3675bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
3676  bool Changed = false;
3677
3678  // Run the main LSR transformation.
3679  Changed |= LSRInstance(TLI, L, this).getChanged();
3680
3681  // At this point, it is worth checking to see if any recurrence PHIs are also
3682  // dead, so that we can remove them as well.
3683  Changed |= DeleteDeadPHIs(L->getHeader());
3684
3685  return Changed;
3686}
3687