X86ISelDAGToDAG.cpp revision 9a508ef64a194f0f4a3362c55a6e33bec18b7554
1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines a DAG pattern matching instruction selector for X86,
11// converting from a legalized dag to a X86 dag.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "x86-isel"
16#include "X86.h"
17#include "X86InstrBuilder.h"
18#include "X86MachineFunctionInfo.h"
19#include "X86RegisterInfo.h"
20#include "X86Subtarget.h"
21#include "X86TargetMachine.h"
22#include "llvm/ADT/Statistic.h"
23#include "llvm/CodeGen/MachineFrameInfo.h"
24#include "llvm/CodeGen/MachineFunction.h"
25#include "llvm/CodeGen/MachineInstrBuilder.h"
26#include "llvm/CodeGen/MachineRegisterInfo.h"
27#include "llvm/CodeGen/SelectionDAGISel.h"
28#include "llvm/IR/Instructions.h"
29#include "llvm/IR/Intrinsics.h"
30#include "llvm/IR/Type.h"
31#include "llvm/Support/Debug.h"
32#include "llvm/Support/ErrorHandling.h"
33#include "llvm/Support/MathExtras.h"
34#include "llvm/Support/raw_ostream.h"
35#include "llvm/Target/TargetMachine.h"
36#include "llvm/Target/TargetOptions.h"
37using namespace llvm;
38
39STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
40
41//===----------------------------------------------------------------------===//
42//                      Pattern Matcher Implementation
43//===----------------------------------------------------------------------===//
44
45namespace {
46  /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses
47  /// SDValues instead of register numbers for the leaves of the matched
48  /// tree.
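  ///
  /// The matched operand has the general x86 form
  ///   Segment:[Base + Scale*Index + Disp(+Symbol)].
  /// For example, the memory operand of "movl 16(%rdi,%rcx,4), %eax" has
  /// Base_Reg = %rdi, Scale = 4, IndexReg = %rcx and Disp = 16.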
49  struct X86ISelAddressMode {
50    enum {
51      RegBase,
52      FrameIndexBase
53    } BaseType;
54
55    // This is really a union, discriminated by BaseType!
56    SDValue Base_Reg;
57    int Base_FrameIndex;
58
59    unsigned Scale;
60    SDValue IndexReg;
61    int32_t Disp;
62    SDValue Segment;
63    const GlobalValue *GV;
64    const Constant *CP;
65    const BlockAddress *BlockAddr;
66    const char *ES;
67    int JT;
68    unsigned Align;    // CP alignment.
69    unsigned char SymbolFlags;  // X86II::MO_*
70
71    X86ISelAddressMode()
72      : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
73        Segment(), GV(0), CP(0), BlockAddr(0), ES(0), JT(-1), Align(0),
74        SymbolFlags(X86II::MO_NO_FLAG) {
75    }
76
77    bool hasSymbolicDisplacement() const {
78      return GV != 0 || CP != 0 || ES != 0 || JT != -1 || BlockAddr != 0;
79    }
80
81    bool hasBaseOrIndexReg() const {
82      return IndexReg.getNode() != 0 || Base_Reg.getNode() != 0;
83    }
84
85    /// isRIPRelative - Return true if this addressing mode is already RIP
86    /// relative.
87    bool isRIPRelative() const {
88      if (BaseType != RegBase) return false;
89      if (RegisterSDNode *RegNode =
90            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
91        return RegNode->getReg() == X86::RIP;
92      return false;
93    }
94
95    void setBaseReg(SDValue Reg) {
96      BaseType = RegBase;
97      Base_Reg = Reg;
98    }
99
100#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
101    void dump() {
102      dbgs() << "X86ISelAddressMode " << this << '\n';
103      dbgs() << "Base_Reg ";
104      if (Base_Reg.getNode() != 0)
105        Base_Reg.getNode()->dump();
106      else
107        dbgs() << "nul";
108      dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'
109             << " Scale" << Scale << '\n'
110             << "IndexReg ";
111      if (IndexReg.getNode() != 0)
112        IndexReg.getNode()->dump();
113      else
114        dbgs() << "nul";
115      dbgs() << " Disp " << Disp << '\n'
116             << "GV ";
117      if (GV)
118        GV->dump();
119      else
120        dbgs() << "nul";
121      dbgs() << " CP ";
122      if (CP)
123        CP->dump();
124      else
125        dbgs() << "nul";
126      dbgs() << '\n'
127             << "ES ";
128      if (ES)
129        dbgs() << ES;
130      else
131        dbgs() << "nul";
132      dbgs() << " JT" << JT << " Align" << Align << '\n';
133    }
134#endif
135  };
136}
137
138namespace {
139  //===--------------------------------------------------------------------===//
140  /// ISel - X86 specific code to select X86 machine instructions for
141  /// SelectionDAG operations.
142  ///
143  class X86DAGToDAGISel : public SelectionDAGISel {
144    /// X86Lowering - This object fully describes how to lower LLVM code to an
145    /// X86-specific SelectionDAG.
146    const X86TargetLowering &X86Lowering;
147
148    /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
149    /// make the right decision when generating code for different targets.
150    const X86Subtarget *Subtarget;
151
152    /// OptForSize - If true, selector should try to optimize for code size
153    /// instead of performance.
154    bool OptForSize;
155
156  public:
157    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
158      : SelectionDAGISel(tm, OptLevel),
159        X86Lowering(*tm.getTargetLowering()),
160        Subtarget(&tm.getSubtarget<X86Subtarget>()),
161        OptForSize(false) {}
162
163    virtual const char *getPassName() const {
164      return "X86 DAG->DAG Instruction Selection";
165    }
166
167    virtual void EmitFunctionEntryCode();
168
169    virtual bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const;
170
171    virtual void PreprocessISelDAG();
172
173    inline bool immSext8(SDNode *N) const {
174      return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue());
175    }
176
177    // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit
178    // sign extended field.
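    // For example, 0xFFFFFFFF80000000 sign-extends from the 32-bit value
    // 0x80000000 and is accepted, while 0x0000000080000000 is rejected.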
179    inline bool i64immSExt32(SDNode *N) const {
180      uint64_t v = cast<ConstantSDNode>(N)->getZExtValue();
181      return (int64_t)v == (int32_t)v;
182    }
183
184// Include the pieces autogenerated from the target description.
185#include "X86GenDAGISel.inc"
186
187  private:
188    SDNode *Select(SDNode *N);
189    SDNode *SelectGather(SDNode *N, unsigned Opc);
190    SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
191    SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT);
192
193    bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
194    bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
195    bool MatchWrapper(SDValue N, X86ISelAddressMode &AM);
196    bool MatchAddress(SDValue N, X86ISelAddressMode &AM);
197    bool MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
198                                 unsigned Depth);
199    bool MatchAddressBase(SDValue N, X86ISelAddressMode &AM);
200    bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
201                    SDValue &Scale, SDValue &Index, SDValue &Disp,
202                    SDValue &Segment);
203    bool SelectMOV64Imm32(SDValue N, SDValue &Imm);
204    bool SelectLEAAddr(SDValue N, SDValue &Base,
205                       SDValue &Scale, SDValue &Index, SDValue &Disp,
206                       SDValue &Segment);
207    bool SelectTLSADDRAddr(SDValue N, SDValue &Base,
208                           SDValue &Scale, SDValue &Index, SDValue &Disp,
209                           SDValue &Segment);
210    bool SelectScalarSSELoad(SDNode *Root, SDValue N,
211                             SDValue &Base, SDValue &Scale,
212                             SDValue &Index, SDValue &Disp,
213                             SDValue &Segment,
214                             SDValue &NodeWithChain);
215
216    bool TryFoldLoad(SDNode *P, SDValue N,
217                     SDValue &Base, SDValue &Scale,
218                     SDValue &Index, SDValue &Disp,
219                     SDValue &Segment);
220
221    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
222    /// inline asm expressions.
223    virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
224                                              char ConstraintCode,
225                                              std::vector<SDValue> &OutOps);
226
227    void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI);
228
229    inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base,
230                                   SDValue &Scale, SDValue &Index,
231                                   SDValue &Disp, SDValue &Segment) {
232      Base  = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
233        CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, TLI.getPointerTy()) :
234        AM.Base_Reg;
235      Scale = getI8Imm(AM.Scale);
236      Index = AM.IndexReg;
237      // These are 32-bit even in 64-bit mode since RIP relative offset
238      // is 32-bit.
239      if (AM.GV)
240        Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
241                                              MVT::i32, AM.Disp,
242                                              AM.SymbolFlags);
243      else if (AM.CP)
244        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
245                                             AM.Align, AM.Disp, AM.SymbolFlags);
246      else if (AM.ES) {
247        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
248        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
249      } else if (AM.JT != -1) {
250        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
251        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
252      } else if (AM.BlockAddr)
253        Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
254                                             AM.SymbolFlags);
255      else
256        Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i32);
257
258      if (AM.Segment.getNode())
259        Segment = AM.Segment;
260      else
261        Segment = CurDAG->getRegister(0, MVT::i32);
262    }
263
264    /// getI8Imm - Return a target constant with the specified value, of type
265    /// i8.
266    inline SDValue getI8Imm(unsigned Imm) {
267      return CurDAG->getTargetConstant(Imm, MVT::i8);
268    }
269
270    /// getI32Imm - Return a target constant with the specified value, of type
271    /// i32.
272    inline SDValue getI32Imm(unsigned Imm) {
273      return CurDAG->getTargetConstant(Imm, MVT::i32);
274    }
275
276    /// getGlobalBaseReg - Return an SDNode that returns the value of
277    /// the global base register. Output instructions required to
278    /// initialize the global base register, if necessary.
279    ///
280    SDNode *getGlobalBaseReg();
281
282    /// getTargetMachine - Return a reference to the TargetMachine, casted
283    /// to the target-specific type.
284    const X86TargetMachine &getTargetMachine() const {
285      return static_cast<const X86TargetMachine &>(TM);
286    }
287
288    /// getInstrInfo - Return a reference to the TargetInstrInfo, casted
289    /// to the target-specific type.
290    const X86InstrInfo *getInstrInfo() const {
291      return getTargetMachine().getInstrInfo();
292    }
293  };
294}
295
296
297bool
298X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
299  if (OptLevel == CodeGenOpt::None) return false;
300
301  if (!N.hasOneUse())
302    return false;
303
304  if (N.getOpcode() != ISD::LOAD)
305    return true;
306
307  // If N is a load, do additional profitability checks.
308  if (U == Root) {
309    switch (U->getOpcode()) {
310    default: break;
311    case X86ISD::ADD:
312    case X86ISD::SUB:
313    case X86ISD::AND:
314    case X86ISD::XOR:
315    case X86ISD::OR:
316    case ISD::ADD:
317    case ISD::ADDC:
318    case ISD::ADDE:
319    case ISD::AND:
320    case ISD::OR:
321    case ISD::XOR: {
322      SDValue Op1 = U->getOperand(1);
323
324      // If the other operand is an 8-bit immediate we should fold the immediate
325      // instead. This reduces code size.
326      // e.g.
327      // movl 4(%esp), %eax
328      // addl $4, %eax
329      // vs.
330      // movl $4, %eax
331      // addl 4(%esp), %eax
332      // The former is 2 bytes shorter. In the case where the increment is 1, the
333      // saving can be 4 bytes (by using incl %eax).
334      if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1))
335        if (Imm->getAPIntValue().isSignedIntN(8))
336          return false;
337
338      // If the other operand is a TLS address, we should fold it instead.
339      // This produces
340      // movl    %gs:0, %eax
341      // leal    i@NTPOFF(%eax), %eax
342      // instead of
343      // movl    $i@NTPOFF, %eax
344      // addl    %gs:0, %eax
345      // if the block also has an access to a second TLS address this will save
346      // a load.
347      // FIXME: This is probably also true for non-TLS addresses.
348      if (Op1.getOpcode() == X86ISD::Wrapper) {
349        SDValue Val = Op1.getOperand(0);
350        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
351          return false;
352      }
353    }
354    }
355  }
356
357  return true;
358}
359
360/// MoveBelowOrigChain - Replace the call's original chain operand with the
361/// load's chain operand and move the load below the call's chain operand.
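/// After the update, the load's chain input is the call's previous chain
/// operand and the call's chain operand is the load's output chain, so the
/// load sits directly below the call and can be folded into it.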
362static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
363                               SDValue Call, SDValue OrigChain) {
364  SmallVector<SDValue, 8> Ops;
365  SDValue Chain = OrigChain.getOperand(0);
366  if (Chain.getNode() == Load.getNode())
367    Ops.push_back(Load.getOperand(0));
368  else {
369    assert(Chain.getOpcode() == ISD::TokenFactor &&
370           "Unexpected chain operand");
371    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
372      if (Chain.getOperand(i).getNode() == Load.getNode())
373        Ops.push_back(Load.getOperand(0));
374      else
375        Ops.push_back(Chain.getOperand(i));
376    SDValue NewChain =
377      CurDAG->getNode(ISD::TokenFactor, SDLoc(Load),
378                      MVT::Other, &Ops[0], Ops.size());
379    Ops.clear();
380    Ops.push_back(NewChain);
381  }
382  for (unsigned i = 1, e = OrigChain.getNumOperands(); i != e; ++i)
383    Ops.push_back(OrigChain.getOperand(i));
384  CurDAG->UpdateNodeOperands(OrigChain.getNode(), &Ops[0], Ops.size());
385  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
386                             Load.getOperand(1), Load.getOperand(2));
387
388  unsigned NumOps = Call.getNode()->getNumOperands();
389  Ops.clear();
390  Ops.push_back(SDValue(Load.getNode(), 1));
391  for (unsigned i = 1, e = NumOps; i != e; ++i)
392    Ops.push_back(Call.getOperand(i));
393  CurDAG->UpdateNodeOperands(Call.getNode(), &Ops[0], NumOps);
394}
395
396/// isCalleeLoad - Return true if the call address is a load and it can be
397/// moved below CALLSEQ_START and the chains leading up to the call.
398/// Return the CALLSEQ_START by reference as a second output.
399/// In the case of a tail call, there isn't a callseq node between the call
400/// chain and the load.
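/// When the load is subsequently folded, the callee is read directly by the
/// call (e.g. "callq *(%rax)" or "calll *4(%esp)") rather than being
/// materialized in a register first.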
401static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
402  // The transformation is somewhat dangerous if the call's chain was glued to
403  // the call. After MoveBelowOrigChain the load is moved between the call and
404  // the chain; this can create a cycle if the load is not folded. So it is
405  // *really* important that we are sure the load will be folded.
406  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
407    return false;
408  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
409  if (!LD ||
410      LD->isVolatile() ||
411      LD->getAddressingMode() != ISD::UNINDEXED ||
412      LD->getExtensionType() != ISD::NON_EXTLOAD)
413    return false;
414
415  // Now let's find the callseq_start.
416  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
417    if (!Chain.hasOneUse())
418      return false;
419    Chain = Chain.getOperand(0);
420  }
421
422  if (!Chain.getNumOperands())
423    return false;
424  // Since we are not checking for AA here, conservatively abort if the chain
425  // writes to memory. It's not safe to move the callee (a load) across a store.
426  if (isa<MemSDNode>(Chain.getNode()) &&
427      cast<MemSDNode>(Chain.getNode())->writeMem())
428    return false;
429  if (Chain.getOperand(0).getNode() == Callee.getNode())
430    return true;
431  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
432      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
433      Callee.getValue(1).hasOneUse())
434    return true;
435  return false;
436}
437
438void X86DAGToDAGISel::PreprocessISelDAG() {
439  // OptForSize is used in pattern predicates that isel is matching.
440  OptForSize = MF->getFunction()->getAttributes().
441    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
442
443  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
444       E = CurDAG->allnodes_end(); I != E; ) {
445    SDNode *N = I++;  // Preincrement iterator to avoid invalidation issues.
446
447    if (OptLevel != CodeGenOpt::None &&
448        // Only do this when the target doesn't favor register-indirect
449        // calls.
450        ((N->getOpcode() == X86ISD::CALL && !Subtarget->callRegIndirect()) ||
451         (N->getOpcode() == X86ISD::TC_RETURN &&
452          // Only do this if the load can be folded into TC_RETURN.
453          (Subtarget->is64Bit() ||
454           getTargetMachine().getRelocationModel() != Reloc::PIC_)))) {
455      /// Also try moving the call address load from outside the callseq_start
456      /// to just before the call so that it can be folded.
457      ///
458      ///     [Load chain]
459      ///         ^
460      ///         |
461      ///       [Load]
462      ///       ^    ^
463      ///       |    |
464      ///      /      \--
465      ///     /          |
466      ///[CALLSEQ_START] |
467      ///     ^          |
468      ///     |          |
469      /// [LOAD/C2Reg]   |
470      ///     |          |
471      ///      \        /
472      ///       \      /
473      ///       [CALL]
474      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
475      SDValue Chain = N->getOperand(0);
476      SDValue Load  = N->getOperand(1);
477      if (!isCalleeLoad(Load, Chain, HasCallSeq))
478        continue;
479      MoveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
480      ++NumLoadMoved;
481      continue;
482    }
483
484    // Lower fpround and fpextend nodes that target the FP stack to a store and
485    // a load through the stack.  This is a gross hack.  We would like to simply mark
486    // these as being illegal, but when we do that, legalize produces these when
487    // it expands calls, then expands these in the same legalize pass.  We would
488    // like dag combine to be able to hack on these between the call expansion
489    // and the node legalization.  As such this pass basically does "really
490    // late" legalization of these inline with the X86 isel pass.
491    // FIXME: This should only happen when not compiled with -O0.
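    // For example, an f64 -> f32 FP_ROUND that must go through memory is
    // rewritten below into an f32 truncating store to a stack temporary
    // followed by an f32 load of the result.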
492    if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
493      continue;
494
495    EVT SrcVT = N->getOperand(0).getValueType();
496    EVT DstVT = N->getValueType(0);
497
498    // If any of the sources are vectors, no fp stack involved.
499    if (SrcVT.isVector() || DstVT.isVector())
500      continue;
501
502    // If the source and destination are SSE registers, then this is a legal
503    // conversion that should not be lowered.
504    bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT);
505    bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT);
506    if (SrcIsSSE && DstIsSSE)
507      continue;
508
509    if (!SrcIsSSE && !DstIsSSE) {
510      // If this is an FPStack extension, it is a noop.
511      if (N->getOpcode() == ISD::FP_EXTEND)
512        continue;
513      // If this is a value-preserving FPStack truncation, it is a noop.
514      if (N->getConstantOperandVal(1))
515        continue;
516    }
517
518    // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
519    // FPStack has extload and truncstore.  SSE can fold direct loads into other
520    // operations.  Based on this, decide what we want to do.
521    EVT MemVT;
522    if (N->getOpcode() == ISD::FP_ROUND)
523      MemVT = DstVT;  // FP_ROUND must use DstVT, we can't do a 'trunc load'.
524    else
525      MemVT = SrcIsSSE ? SrcVT : DstVT;
526
527    SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
528    SDLoc dl(N);
529
530    // FIXME: optimize the case where the src/dest is a load or store?
531    SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl,
532                                          N->getOperand(0),
533                                          MemTmp, MachinePointerInfo(), MemVT,
534                                          false, false, 0);
535    SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
536                                        MachinePointerInfo(),
537                                        MemVT, false, false, 0);
538
539    // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
540    // extload we created.  This will cause general havok on the dag because
541    // anything below the conversion could be folded into other existing nodes.
542    // To avoid invalidating 'I', back it up to the convert node.
543    --I;
544    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
545
546    // Now that we did that, the node is dead.  Increment the iterator to the
547    // next node to process, then delete N.
548    ++I;
549    CurDAG->DeleteNode(N);
550  }
551}
552
553
554/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in
555/// the main function.
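/// On Cygwin/MinGW this emits a call to __main, which the C runtime uses to
/// run global constructors and other initialization before main's body.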
556void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB,
557                                             MachineFrameInfo *MFI) {
558  const TargetInstrInfo *TII = TM.getInstrInfo();
559  if (Subtarget->isTargetCygMing()) {
560    unsigned CallOp =
561      Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32;
562    BuildMI(BB, DebugLoc(),
563            TII->get(CallOp)).addExternalSymbol("__main");
564  }
565}
566
567void X86DAGToDAGISel::EmitFunctionEntryCode() {
568  // If this is main, emit special code for main.
569  if (const Function *Fn = MF->getFunction())
570    if (Fn->hasExternalLinkage() && Fn->getName() == "main")
571      EmitSpecialCodeForMain(MF->begin(), MF->getFrameInfo());
572}
573
574static bool isDispSafeForFrameIndex(int64_t Val) {
575  // On 64-bit platforms, we can run into an issue where a frame index
576  // includes a displacement that, when added to the explicit displacement,
577  // will overflow the displacement field. Assuming that the frame index
578  // displacement fits into a 31-bit integer  (which is only slightly more
579  // aggressive than the current fundamental assumption that it fits into
580  // a 32-bit integer), a 31-bit disp should always be safe.
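  // For example, two values that each fit in 31 signed bits sum to at most
  // 2^31 - 2 (and at least -2^31), which still fits in the signed 32-bit
  // displacement field.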
581  return isInt<31>(Val);
582}
583
584bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset,
585                                            X86ISelAddressMode &AM) {
586  int64_t Val = AM.Disp + Offset;
587  CodeModel::Model M = TM.getCodeModel();
588  if (Subtarget->is64Bit()) {
589    if (!X86::isOffsetSuitableForCodeModel(Val, M,
590                                           AM.hasSymbolicDisplacement()))
591      return true;
592    // In addition to the checks required for a register base, check that
593    // we do not try to use an unsafe Disp with a frame index.
594    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
595        !isDispSafeForFrameIndex(Val))
596      return true;
597  }
598  AM.Disp = Val;
599  return false;
600
601}
602
603bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
604  SDValue Address = N->getOperand(1);
605
606  // load gs:0 -> GS segment register.
607  // load fs:0 -> FS segment register.
608  //
609  // This optimization is valid because the GNU TLS model defines that
610  // gs:0 (or fs:0 on X86-64) contains its own address.
611  // For more information see http://people.redhat.com/drepper/tls.pdf
612  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
613    if (C->getSExtValue() == 0 && AM.Segment.getNode() == 0 &&
614        Subtarget->isTargetLinux())
615      switch (N->getPointerInfo().getAddrSpace()) {
616      case 256:
617        AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
618        return false;
619      case 257:
620        AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
621        return false;
622      }
623
624  return true;
625}
626
627/// MatchWrapper - Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes
628/// into an addressing mode.  These wrap things that will resolve down into a
629/// symbol reference.  If no match is possible, this returns true, otherwise it
630/// returns false.
631bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
632  // If the addressing mode already has a symbol as the displacement, we can
633  // never match another symbol.
634  if (AM.hasSymbolicDisplacement())
635    return true;
636
637  SDValue N0 = N.getOperand(0);
638  CodeModel::Model M = TM.getCodeModel();
639
640  // Handle X86-64 rip-relative addresses.  We check this before checking direct
641  // folding because RIP is preferable to non-RIP accesses.
642  if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP &&
643      // Under X86-64 non-small code model, GV (and friends) are 64-bits, so
644      // they cannot be folded into immediate fields.
645      // FIXME: This can be improved for kernel and other models?
646      (M == CodeModel::Small || M == CodeModel::Kernel)) {
647    // Base and index reg must be 0 in order to use %rip as base.
648    if (AM.hasBaseOrIndexReg())
649      return true;
650    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
651      X86ISelAddressMode Backup = AM;
652      AM.GV = G->getGlobal();
653      AM.SymbolFlags = G->getTargetFlags();
654      if (FoldOffsetIntoAddress(G->getOffset(), AM)) {
655        AM = Backup;
656        return true;
657      }
658    } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
659      X86ISelAddressMode Backup = AM;
660      AM.CP = CP->getConstVal();
661      AM.Align = CP->getAlignment();
662      AM.SymbolFlags = CP->getTargetFlags();
663      if (FoldOffsetIntoAddress(CP->getOffset(), AM)) {
664        AM = Backup;
665        return true;
666      }
667    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
668      AM.ES = S->getSymbol();
669      AM.SymbolFlags = S->getTargetFlags();
670    } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
671      AM.JT = J->getIndex();
672      AM.SymbolFlags = J->getTargetFlags();
673    } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
674      X86ISelAddressMode Backup = AM;
675      AM.BlockAddr = BA->getBlockAddress();
676      AM.SymbolFlags = BA->getTargetFlags();
677      if (FoldOffsetIntoAddress(BA->getOffset(), AM)) {
678        AM = Backup;
679        return true;
680      }
681    } else
682      llvm_unreachable("Unhandled symbol reference node.");
683
684    if (N.getOpcode() == X86ISD::WrapperRIP)
685      AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
686    return false;
687  }
688
689  // Handle the case when globals fit in our immediate field: This is true for
690  // X86-32 always and X86-64 when in -mcmodel=small mode.  In 64-bit
691  // mode, this only applies to a non-RIP-relative computation.
692  if (!Subtarget->is64Bit() ||
693      M == CodeModel::Small || M == CodeModel::Kernel) {
694    assert(N.getOpcode() != X86ISD::WrapperRIP &&
695           "RIP-relative addressing already handled");
696    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
697      AM.GV = G->getGlobal();
698      AM.Disp += G->getOffset();
699      AM.SymbolFlags = G->getTargetFlags();
700    } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
701      AM.CP = CP->getConstVal();
702      AM.Align = CP->getAlignment();
703      AM.Disp += CP->getOffset();
704      AM.SymbolFlags = CP->getTargetFlags();
705    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
706      AM.ES = S->getSymbol();
707      AM.SymbolFlags = S->getTargetFlags();
708    } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
709      AM.JT = J->getIndex();
710      AM.SymbolFlags = J->getTargetFlags();
711    } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
712      AM.BlockAddr = BA->getBlockAddress();
713      AM.Disp += BA->getOffset();
714      AM.SymbolFlags = BA->getTargetFlags();
715    } else
716      llvm_unreachable("Unhandled symbol reference node.");
717    return false;
718  }
719
720  return true;
721}
722
723/// MatchAddress - Add the specified node to the specified addressing mode,
724/// returning true if it cannot be done.  This just pattern matches for the
725/// addressing mode.
726bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) {
727  if (MatchAddressRecursively(N, AM, 0))
728    return true;
729
730  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
731  // a smaller encoding and avoids a scaled-index.
732  if (AM.Scale == 2 &&
733      AM.BaseType == X86ISelAddressMode::RegBase &&
734      AM.Base_Reg.getNode() == 0) {
735    AM.Base_Reg = AM.IndexReg;
736    AM.Scale = 1;
737  }
738
739  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
740  // because it has a smaller encoding.
741  // TODO: Which other code models can use this?
742  if (TM.getCodeModel() == CodeModel::Small &&
743      Subtarget->is64Bit() &&
744      AM.Scale == 1 &&
745      AM.BaseType == X86ISelAddressMode::RegBase &&
746      AM.Base_Reg.getNode() == 0 &&
747      AM.IndexReg.getNode() == 0 &&
748      AM.SymbolFlags == X86II::MO_NO_FLAG &&
749      AM.hasSymbolicDisplacement())
750    AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
751
752  return false;
753}
754
755// Insert a node into the DAG at least before the Pos node's position. This
756// will reposition the node as needed, and will assign it a node ID that is <=
757// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
758// IDs! The selection DAG must no longer depend on their uniqueness when this
759// is used.
760static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
761  if (N.getNode()->getNodeId() == -1 ||
762      N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) {
763    DAG.RepositionNode(Pos.getNode(), N.getNode());
764    N.getNode()->setNodeId(Pos.getNode()->getNodeId());
765  }
766}
767
768// Transform "(X >> (8-C1)) & C2" to "((X >> 8) & 0xff)" if safe. This
769// allows us to convert the shift and AND into an h-register extract and
770// a scaled index. Returns false if the simplification is performed.
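// For example, with a shift of X >> 6 and mask 0x3fc (0xff << 2), the value
// becomes ((X >> 8) & 0xff) << 2; the AND is then used as the index register
// with Scale = 4, which selects as an h-register extract plus a scaled index.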
771static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
772                                      uint64_t Mask,
773                                      SDValue Shift, SDValue X,
774                                      X86ISelAddressMode &AM) {
775  if (Shift.getOpcode() != ISD::SRL ||
776      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
777      !Shift.hasOneUse())
778    return true;
779
780  int ScaleLog = 8 - Shift.getConstantOperandVal(1);
781  if (ScaleLog <= 0 || ScaleLog >= 4 ||
782      Mask != (0xffu << ScaleLog))
783    return true;
784
785  EVT VT = N.getValueType();
786  SDLoc DL(N);
787  SDValue Eight = DAG.getConstant(8, MVT::i8);
788  SDValue NewMask = DAG.getConstant(0xff, VT);
789  SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
790  SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
791  SDValue ShlCount = DAG.getConstant(ScaleLog, MVT::i8);
792  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
793
794  // Insert the new nodes into the topological ordering. We must do this in
795  // a valid topological ordering as nothing is going to go back and re-sort
796  // these nodes. We continually insert before 'N' in sequence as this is
797  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
798  // hierarchy left to express.
799  InsertDAGNode(DAG, N, Eight);
800  InsertDAGNode(DAG, N, Srl);
801  InsertDAGNode(DAG, N, NewMask);
802  InsertDAGNode(DAG, N, And);
803  InsertDAGNode(DAG, N, ShlCount);
804  InsertDAGNode(DAG, N, Shl);
805  DAG.ReplaceAllUsesWith(N, Shl);
806  AM.IndexReg = And;
807  AM.Scale = (1 << ScaleLog);
808  return false;
809}
810
811// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
812// allows us to fold the shift into this addressing mode. Returns false if the
813// transform succeeded.
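// For example, (X << 2) & 0x3fc becomes (X & 0xff) << 2; the shift by 2 is
// then absorbed as Scale = 4 with (X & 0xff) as the index register.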
814static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
815                                        uint64_t Mask,
816                                        SDValue Shift, SDValue X,
817                                        X86ISelAddressMode &AM) {
818  if (Shift.getOpcode() != ISD::SHL ||
819      !isa<ConstantSDNode>(Shift.getOperand(1)))
820    return true;
821
822  // Not likely to be profitable if either the AND or SHIFT node has more
823  // than one use (unless all uses are for address computation). Besides,
824  // isel mechanism requires their node ids to be reused.
825  if (!N.hasOneUse() || !Shift.hasOneUse())
826    return true;
827
828  // Verify that the shift amount is something we can fold.
829  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
830  if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
831    return true;
832
833  EVT VT = N.getValueType();
834  SDLoc DL(N);
835  SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, VT);
836  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
837  SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
838
839  // Insert the new nodes into the topological ordering. We must do this in
840  // a valid topological ordering as nothing is going to go back and re-sort
841  // these nodes. We continually insert before 'N' in sequence as this is
842  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
843  // hierarchy left to express.
844  InsertDAGNode(DAG, N, NewMask);
845  InsertDAGNode(DAG, N, NewAnd);
846  InsertDAGNode(DAG, N, NewShift);
847  DAG.ReplaceAllUsesWith(N, NewShift);
848
849  AM.Scale = 1 << ShiftAmt;
850  AM.IndexReg = NewAnd;
851  return false;
852}
853
854// Implement some heroics to detect shifts of masked values where the mask can
855// be replaced by extending the shift and undoing that in the addressing mode
856// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
857// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
858// the addressing mode. This results in code such as:
859//
860//   int f(short *y, int *lookup_table) {
861//     ...
862//     return *y + lookup_table[*y >> 11];
863//   }
864//
865// Turning into:
866//   movzwl (%rdi), %eax
867//   movl %eax, %ecx
868//   shrl $11, %ecx
869//   addl (%rsi,%rcx,4), %eax
870//
871// Instead of:
872//   movzwl (%rdi), %eax
873//   movl %eax, %ecx
874//   shrl $9, %ecx
875//   andl $124, %ecx
876//   addl (%rsi,%rcx), %eax
877//
878// Note that this function assumes the mask is provided as a mask *after* the
879// value is shifted. The input chain may or may not match that, but computing
880// such a mask is trivial.
881static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
882                                    uint64_t Mask,
883                                    SDValue Shift, SDValue X,
884                                    X86ISelAddressMode &AM) {
885  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
886      !isa<ConstantSDNode>(Shift.getOperand(1)))
887    return true;
888
889  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
890  unsigned MaskLZ = countLeadingZeros(Mask);
891  unsigned MaskTZ = countTrailingZeros(Mask);
892
893  // The amount of shift we're trying to fit into the addressing mode is taken
894  // from the trailing zeros of the mask.
895  unsigned AMShiftAmt = MaskTZ;
896
897  // There is nothing we can do here unless the mask is removing some bits.
898  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
899  if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
900
901  // We also need to ensure that the mask is a contiguous run of bits.
902  if (CountTrailingOnes_64(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
903
904  // Scale the leading zero count down based on the actual size of the value.
905  // Also scale it down based on the size of the shift.
906  MaskLZ -= (64 - X.getValueSizeInBits()) + ShiftAmt;
907
908  // The final check is to ensure that any masked out high bits of X are
909  // already known to be zero. Otherwise, the mask has a semantic impact
910  // other than masking out a couple of low bits. Unfortunately, because of
911  // the mask, zero extensions will be removed from operands in some cases.
912  // This code works extra hard to look through extensions because we can
913  // replace them with zero extensions cheaply if necessary.
914  bool ReplacingAnyExtend = false;
915  if (X.getOpcode() == ISD::ANY_EXTEND) {
916    unsigned ExtendBits =
917      X.getValueSizeInBits() - X.getOperand(0).getValueSizeInBits();
918    // Assume that we'll replace the any-extend with a zero-extend, and
919    // narrow the search to the extended value.
920    X = X.getOperand(0);
921    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
922    ReplacingAnyExtend = true;
923  }
924  APInt MaskedHighBits = APInt::getHighBitsSet(X.getValueSizeInBits(),
925                                               MaskLZ);
926  APInt KnownZero, KnownOne;
927  DAG.ComputeMaskedBits(X, KnownZero, KnownOne);
928  if (MaskedHighBits != KnownZero) return true;
929
930  // We've identified a pattern that can be transformed into a single shift
931  // and an addressing mode. Make it so.
932  EVT VT = N.getValueType();
933  if (ReplacingAnyExtend) {
934    assert(X.getValueType() != VT);
935    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
936    SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
937    InsertDAGNode(DAG, N, NewX);
938    X = NewX;
939  }
940  SDLoc DL(N);
941  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, MVT::i8);
942  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
943  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, MVT::i8);
944  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
945
946  // Insert the new nodes into the topological ordering. We must do this in
947  // a valid topological ordering as nothing is going to go back and re-sort
948  // these nodes. We continually insert before 'N' in sequence as this is
949  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
950  // hierarchy left to express.
951  InsertDAGNode(DAG, N, NewSRLAmt);
952  InsertDAGNode(DAG, N, NewSRL);
953  InsertDAGNode(DAG, N, NewSHLAmt);
954  InsertDAGNode(DAG, N, NewSHL);
955  DAG.ReplaceAllUsesWith(N, NewSHL);
956
957  AM.Scale = 1 << AMShiftAmt;
958  AM.IndexReg = NewSRL;
959  return false;
960}
961
962bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
963                                              unsigned Depth) {
964  SDLoc dl(N);
965  DEBUG({
966      dbgs() << "MatchAddress: ";
967      AM.dump();
968    });
969  // Limit recursion.
970  if (Depth > 5)
971    return MatchAddressBase(N, AM);
972
973  // If this is already a %rip relative address, we can only merge immediates
974  // into it.  Instead of handling this in every case, we handle it here.
975  // RIP relative addressing: %rip + 32-bit displacement!
976  if (AM.isRIPRelative()) {
977    // FIXME: JumpTable and ExternalSymbol address currently don't like
978    // displacements.  It isn't very important, but this should be fixed for
979    // consistency.
980    if (!AM.ES && AM.JT != -1) return true;
981
982    if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
983      if (!FoldOffsetIntoAddress(Cst->getSExtValue(), AM))
984        return false;
985    return true;
986  }
987
988  switch (N.getOpcode()) {
989  default: break;
990  case ISD::Constant: {
991    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
992    if (!FoldOffsetIntoAddress(Val, AM))
993      return false;
994    break;
995  }
996
997  case X86ISD::Wrapper:
998  case X86ISD::WrapperRIP:
999    if (!MatchWrapper(N, AM))
1000      return false;
1001    break;
1002
1003  case ISD::LOAD:
1004    if (!MatchLoadInAddress(cast<LoadSDNode>(N), AM))
1005      return false;
1006    break;
1007
1008  case ISD::FrameIndex:
1009    if (AM.BaseType == X86ISelAddressMode::RegBase &&
1010        AM.Base_Reg.getNode() == 0 &&
1011        (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
1012      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
1013      AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
1014      return false;
1015    }
1016    break;
1017
1018  case ISD::SHL:
1019    if (AM.IndexReg.getNode() != 0 || AM.Scale != 1)
1020      break;
1021
1022    if (ConstantSDNode
1023          *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
1024      unsigned Val = CN->getZExtValue();
1025      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
1026      // that the base operand remains free for further matching. If
1027      // the base doesn't end up getting used, a post-processing step
1028      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
1029      if (Val == 1 || Val == 2 || Val == 3) {
1030        AM.Scale = 1 << Val;
1031        SDValue ShVal = N.getNode()->getOperand(0);
1032
1033        // Okay, we know that we have a scale by now.  However, if the scaled
1034        // value is an add of something and a constant, we can fold the
1035        // constant into the disp field here.
1036        if (CurDAG->isBaseWithConstantOffset(ShVal)) {
1037          AM.IndexReg = ShVal.getNode()->getOperand(0);
1038          ConstantSDNode *AddVal =
1039            cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
1040          uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
1041          if (!FoldOffsetIntoAddress(Disp, AM))
1042            return false;
1043        }
1044
1045        AM.IndexReg = ShVal;
1046        return false;
1047      }
1048    }
1049    break;
1050
1051  case ISD::SRL: {
1052    // Scale must not be used already.
1053    if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break;
1054
1055    SDValue And = N.getOperand(0);
1056    if (And.getOpcode() != ISD::AND) break;
1057    SDValue X = And.getOperand(0);
1058
1059    // We only handle up to 64-bit values here as those are what matter for
1060    // addressing mode optimizations.
1061    if (X.getValueSizeInBits() > 64) break;
1062
1063    // The mask used for the transform is expected to be post-shift, but we
1064    // found the shift first so just apply the shift to the mask before passing
1065    // it down.
1066    if (!isa<ConstantSDNode>(N.getOperand(1)) ||
1067        !isa<ConstantSDNode>(And.getOperand(1)))
1068      break;
1069    uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
1070
1071    // Try to fold the mask and shift into the scale, and return false if we
1072    // succeed.
1073    if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
1074      return false;
1075    break;
1076  }
1077
1078  case ISD::SMUL_LOHI:
1079  case ISD::UMUL_LOHI:
1080    // A mul_lohi where we need the low part can be folded as a plain multiply.
1081    if (N.getResNo() != 0) break;
1082    // FALL THROUGH
1083  case ISD::MUL:
1084  case X86ISD::MUL_IMM:
1085    // X*[3,5,9] -> X+X*[2,4,8]
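    // For example, 5*X is selected as "leal (%reg,%reg,4), %dst", using X as
    // both the base and the index register.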
1086    if (AM.BaseType == X86ISelAddressMode::RegBase &&
1087        AM.Base_Reg.getNode() == 0 &&
1088        AM.IndexReg.getNode() == 0) {
1089      if (ConstantSDNode
1090            *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
1091        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
1092            CN->getZExtValue() == 9) {
1093          AM.Scale = unsigned(CN->getZExtValue())-1;
1094
1095          SDValue MulVal = N.getNode()->getOperand(0);
1096          SDValue Reg;
1097
1098          // Okay, we know that we have a scale by now.  However, if the scaled
1099          // value is an add of something and a constant, we can fold the
1100          // constant into the disp field here.
1101          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
1102              isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
1103            Reg = MulVal.getNode()->getOperand(0);
1104            ConstantSDNode *AddVal =
1105              cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
1106            uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
1107            if (FoldOffsetIntoAddress(Disp, AM))
1108              Reg = N.getNode()->getOperand(0);
1109          } else {
1110            Reg = N.getNode()->getOperand(0);
1111          }
1112
1113          AM.IndexReg = AM.Base_Reg = Reg;
1114          return false;
1115        }
1116    }
1117    break;
1118
1119  case ISD::SUB: {
1120    // Given A-B, if A can be completely folded into the address, leaving
1121    // the index field unused, use -B as the index.
1122    // This is a win if A has multiple parts that can be folded into
1123    // the address. Also, this saves a mov if the base register has
1124    // other uses, since it avoids a two-address sub instruction; however,
1125    // it costs an additional mov if the index register has other uses.
1126
1127    // Add an artificial use to this node so that we can keep track of
1128    // it if it gets CSE'd with a different node.
1129    HandleSDNode Handle(N);
1130
1131    // Test if the LHS of the sub can be folded.
1132    X86ISelAddressMode Backup = AM;
1133    if (MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
1134      AM = Backup;
1135      break;
1136    }
1137    // Test if the index field is free for use.
1138    if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
1139      AM = Backup;
1140      break;
1141    }
1142
1143    int Cost = 0;
1144    SDValue RHS = Handle.getValue().getNode()->getOperand(1);
1145    // If the RHS involves a register with multiple uses, this
1146    // transformation incurs an extra mov, due to the neg instruction
1147    // clobbering its operand.
1148    if (!RHS.getNode()->hasOneUse() ||
1149        RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
1150        RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
1151        RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
1152        (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
1153         RHS.getNode()->getOperand(0).getValueType() == MVT::i32))
1154      ++Cost;
1155    // If the base is a register with multiple uses, this
1156    // transformation may save a mov.
1157    if ((AM.BaseType == X86ISelAddressMode::RegBase &&
1158         AM.Base_Reg.getNode() &&
1159         !AM.Base_Reg.getNode()->hasOneUse()) ||
1160        AM.BaseType == X86ISelAddressMode::FrameIndexBase)
1161      --Cost;
1162    // If the folded LHS was interesting, this transformation saves
1163    // address arithmetic.
1164    if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
1165        ((AM.Disp != 0) && (Backup.Disp == 0)) +
1166        (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
1167      --Cost;
1168    // If it doesn't look like it may be an overall win, don't do it.
1169    if (Cost >= 0) {
1170      AM = Backup;
1171      break;
1172    }
1173
1174    // Ok, the transformation is legal and appears profitable. Go for it.
1175    SDValue Zero = CurDAG->getConstant(0, N.getValueType());
1176    SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
1177    AM.IndexReg = Neg;
1178    AM.Scale = 1;
1179
1180    // Insert the new nodes into the topological ordering.
1181    InsertDAGNode(*CurDAG, N, Zero);
1182    InsertDAGNode(*CurDAG, N, Neg);
1183    return false;
1184  }
1185
1186  case ISD::ADD: {
1187    // Add an artificial use to this node so that we can keep track of
1188    // it if it gets CSE'd with a different node.
1189    HandleSDNode Handle(N);
1190
1191    X86ISelAddressMode Backup = AM;
1192    if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
1193        !MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
1194      return false;
1195    AM = Backup;
1196
1197    // Try again after commuting the operands.
1198    if (!MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)&&
1199        !MatchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
1200      return false;
1201    AM = Backup;
1202
1203    // If we couldn't fold both operands into the address at the same time,
1204    // see if we can just put each operand into a register and fold at least
1205    // the add.
1206    if (AM.BaseType == X86ISelAddressMode::RegBase &&
1207        !AM.Base_Reg.getNode() &&
1208        !AM.IndexReg.getNode()) {
1209      N = Handle.getValue();
1210      AM.Base_Reg = N.getOperand(0);
1211      AM.IndexReg = N.getOperand(1);
1212      AM.Scale = 1;
1213      return false;
1214    }
1215    N = Handle.getValue();
1216    break;
1217  }
1218
1219  case ISD::OR:
1220    // Handle "X | C" as "X + C" iff X is known to have C bits clear.
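    // For example, (X << 2) | 3 has its low two bits known clear before the
    // OR, so it can be matched as the address 3(,X,4).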
1221    if (CurDAG->isBaseWithConstantOffset(N)) {
1222      X86ISelAddressMode Backup = AM;
1223      ConstantSDNode *CN = cast<ConstantSDNode>(N.getOperand(1));
1224
1225      // Start with the LHS as an addr mode.
1226      if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
1227          !FoldOffsetIntoAddress(CN->getSExtValue(), AM))
1228        return false;
1229      AM = Backup;
1230    }
1231    break;
1232
1233  case ISD::AND: {
1234    // Perform some heroic transforms on an and of a constant-count shift
1235    // with a constant to enable use of the scaled offset field.
1236
1237    // Scale must not be used already.
1238    if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break;
1239
1240    SDValue Shift = N.getOperand(0);
1241    if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break;
1242    SDValue X = Shift.getOperand(0);
1243
1244    // We only handle up to 64-bit values here as those are what matter for
1245    // addressing mode optimizations.
1246    if (X.getValueSizeInBits() > 64) break;
1247
1248    if (!isa<ConstantSDNode>(N.getOperand(1)))
1249      break;
1250    uint64_t Mask = N.getConstantOperandVal(1);
1251
1252    // Try to fold the mask and shift into an extract and scale.
1253    if (!FoldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
1254      return false;
1255
1256    // Try to fold the mask and shift directly into the scale.
1257    if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
1258      return false;
1259
1260    // Try to swap the mask and shift to place shifts which can be done as
1261    // a scale on the outside of the mask.
1262    if (!FoldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
1263      return false;
1264    break;
1265  }
1266  }
1267
1268  return MatchAddressBase(N, AM);
1269}
1270
1271/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the
1272/// specified addressing mode without any further recursion.
1273bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) {
1274  // Is the base register already occupied?
1275  if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
1276    // If so, check to see if the scale index register is set.
1277    if (AM.IndexReg.getNode() == 0) {
1278      AM.IndexReg = N;
1279      AM.Scale = 1;
1280      return false;
1281    }
1282
1283    // Otherwise, we cannot select it.
1284    return true;
1285  }
1286
1287  // Default, generate it as a register.
1288  AM.BaseType = X86ISelAddressMode::RegBase;
1289  AM.Base_Reg = N;
1290  return false;
1291}
1292
1293/// SelectAddr - Returns true if it is able to pattern match an addressing mode.
1294/// It returns by reference the operands which make up the maximal addressing
1295/// mode it can match.
1296///
1297/// Parent is the parent node of the addr operand that is being matched.  It
1298/// is always a load, store, atomic node, or null.  It is only null when
1299/// checking memory operands for inline asm nodes.
1300bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
1301                                 SDValue &Scale, SDValue &Index,
1302                                 SDValue &Disp, SDValue &Segment) {
1303  X86ISelAddressMode AM;
1304
1305  if (Parent &&
1306      // These opcodes all have an "addr:$ptr" operand but are not MemSDNodes,
1307      // and thus don't have proper addrspace info.
1308      Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
1309      Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
1310      Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
1311      Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
1312      Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
1313    unsigned AddrSpace =
1314      cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
1315    // AddrSpace 256 -> GS, 257 -> FS.
1316    if (AddrSpace == 256)
1317      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1318    if (AddrSpace == 257)
1319      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1320  }
1321
1322  if (MatchAddress(N, AM))
1323    return false;
1324
1325  EVT VT = N.getValueType();
1326  if (AM.BaseType == X86ISelAddressMode::RegBase) {
1327    if (!AM.Base_Reg.getNode())
1328      AM.Base_Reg = CurDAG->getRegister(0, VT);
1329  }
1330
1331  if (!AM.IndexReg.getNode())
1332    AM.IndexReg = CurDAG->getRegister(0, VT);
1333
1334  getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
1335  return true;
1336}
1337
1338/// SelectScalarSSELoad - Match a scalar SSE load.  In particular, we want to
1339/// match a load whose top elements are either undef or zeros.  The load flavor
1340/// is derived from the type of N, which is either v4f32 or v2f64.
1341///
1342/// We also return:
1343///   PatternChainNode: this is the matched node that has a chain input and
1344///   output.
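/// A successful match lets the patterns fold the load directly into a scalar
/// SSE instruction, e.g. "addss (%rax), %xmm0" instead of a separate movss
/// followed by a register-register addss.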
1345bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root,
1346                                          SDValue N, SDValue &Base,
1347                                          SDValue &Scale, SDValue &Index,
1348                                          SDValue &Disp, SDValue &Segment,
1349                                          SDValue &PatternNodeWithChain) {
1350  if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) {
1351    PatternNodeWithChain = N.getOperand(0);
1352    if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
1353        PatternNodeWithChain.hasOneUse() &&
1354        IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
1355        IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) {
1356      LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
1357      if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
1358        return false;
1359      return true;
1360    }
1361  }
1362
1363  // Also handle the case where we explicitly require zeros in the top
1364  // elements.  This is a vector shuffle from the zero vector.
1365  if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
1366      // Check to see if the top elements are all zeros (or bitcast of zeros).
1367      N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
1368      N.getOperand(0).getNode()->hasOneUse() &&
1369      ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) &&
1370      N.getOperand(0).getOperand(0).hasOneUse() &&
1371      IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
1372      IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) {
1373    // Okay, this is a zero extending load.  Fold it.
1374    LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0));
1375    if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
1376      return false;
1377    PatternNodeWithChain = SDValue(LD, 0);
1378    return true;
1379  }
1380  return false;
1381}
1382
1383
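/// SelectMOV64Imm32 - Match a 64-bit operand that can be materialized with a
/// 32-bit mov: either an immediate that zero-extends from 32 bits, or a
/// wrapped symbolic address when targeting the small code model.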
1384bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) {
1385  if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
1386    uint64_t ImmVal = CN->getZExtValue();
1387    if ((uint32_t)ImmVal != (uint64_t)ImmVal)
1388      return false;
1389
1390    Imm = CurDAG->getTargetConstant(ImmVal, MVT::i64);
1391    return true;
1392  }
1393
1394  // In static codegen with small code model, we can get the address of a label
1395  // into a register with 'movl'. TableGen has already made sure we're looking
1396  // at a label of some kind.
1397  assert(N->getOpcode() == X86ISD::Wrapper && "Unexpected node type for MOV32ri64");
1398  N = N.getOperand(0);
1399
1400  if (N->getOpcode() != ISD::TargetConstantPool &&
1401      N->getOpcode() != ISD::TargetJumpTable &&
1402      N->getOpcode() != ISD::TargetGlobalAddress &&
1403      N->getOpcode() != ISD::TargetExternalSymbol &&
1404      N->getOpcode() != ISD::TargetBlockAddress)
1405    return false;
1406
1407  Imm = N;
1408  return TM.getCodeModel() == CodeModel::Small;
1409}
1410
1411/// SelectLEAAddr - It calls SelectAddr and determines if the maximal addressing
1412/// mode it matches can be cost-effectively emitted as an LEA instruction.
1413bool X86DAGToDAGISel::SelectLEAAddr(SDValue N,
1414                                    SDValue &Base, SDValue &Scale,
1415                                    SDValue &Index, SDValue &Disp,
1416                                    SDValue &Segment) {
1417  X86ISelAddressMode AM;
1418
1419  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
1420  // segments.
1421  SDValue Copy = AM.Segment;
1422  SDValue T = CurDAG->getRegister(0, MVT::i32);
1423  AM.Segment = T;
1424  if (MatchAddress(N, AM))
1425    return false;
1426  assert(T == AM.Segment);
1427  AM.Segment = Copy;
1428
1429  EVT VT = N.getValueType();
1430  unsigned Complexity = 0;
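  // Score how much of the address an LEA would fold (base, index, scale,
  // displacement, symbol). Addresses scoring 2 or less are cheaper to form
  // with plain ALU ops, so they are rejected at the end of this function.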
1431  if (AM.BaseType == X86ISelAddressMode::RegBase) {
1432    if (AM.Base_Reg.getNode())
1433      Complexity = 1;
1434    else
1435      AM.Base_Reg = CurDAG->getRegister(0, VT);
1436  } else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
1437    Complexity = 4;
1438
1439  if (AM.IndexReg.getNode())
1440    Complexity++;
1441  else
1442    AM.IndexReg = CurDAG->getRegister(0, VT);
1443
1444  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg or a
1445  // simple shift.
1446  if (AM.Scale > 1)
1447    Complexity++;
1448
1449  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
1450  // into an LEA. This was determined by experimentation but is by no means
1451  // optimal (especially for code size considerations). LEA is nice because of
1452  // its three-address nature. Tweak the cost function again when we can run
1453  // convertToThreeAddress() at register allocation time.
1454  if (AM.hasSymbolicDisplacement()) {
1455    // For X86-64, we should always use lea to materialize RIP relative
1456    // addresses.
1457    if (Subtarget->is64Bit())
1458      Complexity = 4;
1459    else
1460      Complexity += 2;
1461  }
1462
1463  if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode()))
1464    Complexity++;
1465
1466  // If it isn't worth using an LEA, reject it.
1467  if (Complexity <= 2)
1468    return false;
1469
1470  getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
1471  return true;
1472}
1473
1474/// SelectTLSADDRAddr - This is only run on TargetGlobalTLSAddress nodes.
1475bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base,
1476                                        SDValue &Scale, SDValue &Index,
1477                                        SDValue &Disp, SDValue &Segment) {
1478  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
1479  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
1480
1481  X86ISelAddressMode AM;
1482  AM.GV = GA->getGlobal();
1483  AM.Disp += GA->getOffset();
1484  AM.Base_Reg = CurDAG->getRegister(0, N.getValueType());
1485  AM.SymbolFlags = GA->getTargetFlags();
1486
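  // In 32-bit mode the TLS address computation uses the GOT pointer, which
  // the ABI expects in EBX, as the index register. In 64-bit mode the address
  // is RIP-relative and no index register is needed.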
1487  if (N.getValueType() == MVT::i32) {
1488    AM.Scale = 1;
1489    AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
1490  } else {
1491    AM.IndexReg = CurDAG->getRegister(0, MVT::i64);
1492  }
1493
1494  getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
1495  return true;
1496}
1497
1498
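/// TryFoldLoad - Return true if the load N can be folded into P as a memory
/// operand, selecting its address into Base/Scale/Index/Disp/Segment. Only
/// plain non-extending loads that are profitable and legal to fold qualify.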
1499bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N,
1500                                  SDValue &Base, SDValue &Scale,
1501                                  SDValue &Index, SDValue &Disp,
1502                                  SDValue &Segment) {
1503  if (!ISD::isNON_EXTLoad(N.getNode()) ||
1504      !IsProfitableToFold(N, P, P) ||
1505      !IsLegalToFold(N, P, P, OptLevel))
1506    return false;
1507
1508  return SelectAddr(N.getNode(),
1509                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
1510}
1511
1512/// getGlobalBaseReg - Return an SDNode that returns the value of
1513/// the global base register. Output instructions required to
1514/// initialize the global base register, if necessary.
1515///
1516SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
1517  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
1518  return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
1519}
1520
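/// SelectAtomic64 - Select an ATOMxxx6432 pseudo instruction for a 64-bit
/// atomic read-modify-write operation whose value operand is split into two
/// 32-bit halves (In2L/In2H); the pseudo produces both result halves and a
/// chain.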
1521SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
1522  SDValue Chain = Node->getOperand(0);
1523  SDValue In1 = Node->getOperand(1);
1524  SDValue In2L = Node->getOperand(2);
1525  SDValue In2H = Node->getOperand(3);
1526
1527  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
1528  if (!SelectAddr(Node, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
1529    return NULL;
1530  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
1531  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
1532  const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain};
1533  SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node),
1534                                           MVT::i32, MVT::i32, MVT::Other, Ops);
1535  cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
1536  return ResNode;
1537}
1538
1539/// Atomic opcode table
1540///
1541enum AtomicOpc {
1542  ADD,
1543  SUB,
1544  INC,
1545  DEC,
1546  OR,
1547  AND,
1548  XOR,
1549  AtomicOpcEnd
1550};
1551
1552enum AtomicSz {
1553  ConstantI8,
1554  I8,
1555  SextConstantI16,
1556  ConstantI16,
1557  I16,
1558  SextConstantI32,
1559  ConstantI32,
1560  I32,
1561  SextConstantI64,
1562  ConstantI64,
1563  I64,
1564  AtomicSzEnd
1565};
1566
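// Each row corresponds to an AtomicOpc above and each column to an AtomicSz
// operand form. A zero entry means that combination does not exist (INC and
// DEC take no explicit source operand, so only one form per width is filled).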
1567static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
1568  {
1569    X86::LOCK_ADD8mi,
1570    X86::LOCK_ADD8mr,
1571    X86::LOCK_ADD16mi8,
1572    X86::LOCK_ADD16mi,
1573    X86::LOCK_ADD16mr,
1574    X86::LOCK_ADD32mi8,
1575    X86::LOCK_ADD32mi,
1576    X86::LOCK_ADD32mr,
1577    X86::LOCK_ADD64mi8,
1578    X86::LOCK_ADD64mi32,
1579    X86::LOCK_ADD64mr,
1580  },
1581  {
1582    X86::LOCK_SUB8mi,
1583    X86::LOCK_SUB8mr,
1584    X86::LOCK_SUB16mi8,
1585    X86::LOCK_SUB16mi,
1586    X86::LOCK_SUB16mr,
1587    X86::LOCK_SUB32mi8,
1588    X86::LOCK_SUB32mi,
1589    X86::LOCK_SUB32mr,
1590    X86::LOCK_SUB64mi8,
1591    X86::LOCK_SUB64mi32,
1592    X86::LOCK_SUB64mr,
1593  },
1594  {
1595    0,
1596    X86::LOCK_INC8m,
1597    0,
1598    0,
1599    X86::LOCK_INC16m,
1600    0,
1601    0,
1602    X86::LOCK_INC32m,
1603    0,
1604    0,
1605    X86::LOCK_INC64m,
1606  },
1607  {
1608    0,
1609    X86::LOCK_DEC8m,
1610    0,
1611    0,
1612    X86::LOCK_DEC16m,
1613    0,
1614    0,
1615    X86::LOCK_DEC32m,
1616    0,
1617    0,
1618    X86::LOCK_DEC64m,
1619  },
1620  {
1621    X86::LOCK_OR8mi,
1622    X86::LOCK_OR8mr,
1623    X86::LOCK_OR16mi8,
1624    X86::LOCK_OR16mi,
1625    X86::LOCK_OR16mr,
1626    X86::LOCK_OR32mi8,
1627    X86::LOCK_OR32mi,
1628    X86::LOCK_OR32mr,
1629    X86::LOCK_OR64mi8,
1630    X86::LOCK_OR64mi32,
1631    X86::LOCK_OR64mr,
1632  },
1633  {
1634    X86::LOCK_AND8mi,
1635    X86::LOCK_AND8mr,
1636    X86::LOCK_AND16mi8,
1637    X86::LOCK_AND16mi,
1638    X86::LOCK_AND16mr,
1639    X86::LOCK_AND32mi8,
1640    X86::LOCK_AND32mi,
1641    X86::LOCK_AND32mr,
1642    X86::LOCK_AND64mi8,
1643    X86::LOCK_AND64mi32,
1644    X86::LOCK_AND64mr,
1645  },
1646  {
1647    X86::LOCK_XOR8mi,
1648    X86::LOCK_XOR8mr,
1649    X86::LOCK_XOR16mi8,
1650    X86::LOCK_XOR16mi,
1651    X86::LOCK_XOR16mr,
1652    X86::LOCK_XOR32mi8,
1653    X86::LOCK_XOR32mi,
1654    X86::LOCK_XOR32mr,
1655    X86::LOCK_XOR64mi8,
1656    X86::LOCK_XOR64mi32,
1657    X86::LOCK_XOR64mr,
1658  }
1659};
1660
1661// Return the target constant operand for atomic-load-op and do simple
1662// translations, such as from atomic-load-add to lock-sub. The return value is
1663// one of the following three cases:
1664// + a target constant, if the operand can be encoded as an immediate.
1665// + empty, if the operand is no longer needed once the new op is selected.
1666// + the (possibly rewritten) value operand, otherwise.
1667static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
1668                                                SDLoc dl,
1669                                                enum AtomicOpc &Op, EVT NVT,
1670                                                SDValue Val) {
1671  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) {
1672    int64_t CNVal = CN->getSExtValue();
1673    // Quit if not 32-bit imm.
1674    if ((int32_t)CNVal != CNVal)
1675      return Val;
1676    // For atomic-load-add, we could do some optimizations.
1677    if (Op == ADD) {
1678      // Translate to INC/DEC if ADD by 1 or -1.
1679      if ((CNVal == 1) || (CNVal == -1)) {
1680        Op = (CNVal == 1) ? INC : DEC;
1681        // No more constant operand after being translated into INC/DEC.
1682        return SDValue();
1683      }
1684      // Translate to SUB if ADD by negative value.
1685      if (CNVal < 0) {
1686        Op = SUB;
1687        CNVal = -CNVal;
1688      }
1689    }
1690    return CurDAG->getTargetConstant(CNVal, NVT);
1691  }
1692
1693  // If the value operand has only one use, try to optimize it.
1694  if (Op == ADD && Val.hasOneUse()) {
1695    // Translate (atomic-load-add ptr (sub 0 x)) back to (lock-sub x).
1696    if (Val.getOpcode() == ISD::SUB && X86::isZeroNode(Val.getOperand(0))) {
1697      Op = SUB;
1698      return Val.getOperand(1);
1699    }
1700    // A special case for i16: it is usually promoted to i32, so the operand
1701    // needs truncating. We translate
1702    // (atomic-load-add (truncate (sub 0 x))) to (lock-sub (EXTRACT_SUBREG x)).
1703    if (Val.getOpcode() == ISD::TRUNCATE && NVT == MVT::i16 &&
1704        Val.getOperand(0).getOpcode() == ISD::SUB &&
1705        X86::isZeroNode(Val.getOperand(0).getOperand(0))) {
1706      Op = SUB;
1707      Val = Val.getOperand(0);
1708      return CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, NVT,
1709                                            Val.getOperand(1));
1710    }
1711  }
1712
1713  return Val;
1714}
1715
1716SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
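  // The LOCK-prefixed memory forms discard the loaded value, so this
  // transformation only applies when the atomic op's result is unused.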
1717  if (Node->hasAnyUseOfValue(0))
1718    return 0;
1719
1720  SDLoc dl(Node);
1721
1722  // Optimize common patterns for __sync_or_and_fetch and similar arith
1723  // operations where the result is not used. This allows us to use the "lock"
1724  // version of the arithmetic instruction.
1725  SDValue Chain = Node->getOperand(0);
1726  SDValue Ptr = Node->getOperand(1);
1727  SDValue Val = Node->getOperand(2);
1728  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
1729  if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
1730    return 0;
1731
1732  // Determine which operation row of the table to use.
1733  enum AtomicOpc Op;
1734  switch (Node->getOpcode()) {
1735    default:
1736      return 0;
1737    case ISD::ATOMIC_LOAD_OR:
1738      Op = OR;
1739      break;
1740    case ISD::ATOMIC_LOAD_AND:
1741      Op = AND;
1742      break;
1743    case ISD::ATOMIC_LOAD_XOR:
1744      Op = XOR;
1745      break;
1746    case ISD::ATOMIC_LOAD_ADD:
1747      Op = ADD;
1748      break;
1749  }
1750
1751  Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val);
1752  bool isUnOp = !Val.getNode();
1753  bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant);
1754
1755  unsigned Opc = 0;
1756  switch (NVT.getSimpleVT().SimpleTy) {
1757    default: return 0;
1758    case MVT::i8:
1759      if (isCN)
1760        Opc = AtomicOpcTbl[Op][ConstantI8];
1761      else
1762        Opc = AtomicOpcTbl[Op][I8];
1763      break;
1764    case MVT::i16:
1765      if (isCN) {
1766        if (immSext8(Val.getNode()))
1767          Opc = AtomicOpcTbl[Op][SextConstantI16];
1768        else
1769          Opc = AtomicOpcTbl[Op][ConstantI16];
1770      } else
1771        Opc = AtomicOpcTbl[Op][I16];
1772      break;
1773    case MVT::i32:
1774      if (isCN) {
1775        if (immSext8(Val.getNode()))
1776          Opc = AtomicOpcTbl[Op][SextConstantI32];
1777        else
1778          Opc = AtomicOpcTbl[Op][ConstantI32];
1779      } else
1780        Opc = AtomicOpcTbl[Op][I32];
1781      break;
1782    case MVT::i64:
1783      Opc = AtomicOpcTbl[Op][I64];
1784      if (isCN) {
1785        if (immSext8(Val.getNode()))
1786          Opc = AtomicOpcTbl[Op][SextConstantI64];
1787        else if (i64immSExt32(Val.getNode()))
1788          Opc = AtomicOpcTbl[Op][ConstantI64];
1789      }
1790      break;
1791  }
1792
1793  assert(Opc != 0 && "Invalid arith lock transform!");
1794
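  // The value result was verified to be dead above, so return an IMPLICIT_DEF
  // for it alongside the chain produced by the new LOCKed instruction.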
1795  SDValue Ret;
1796  SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
1797                                                 dl, NVT), 0);
1798  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
1799  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
1800  if (isUnOp) {
1801    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain };
1802    Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0);
1803  } else {
1804    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
1805    Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0);
1806  }
1807  cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
1808  SDValue RetVals[] = { Undef, Ret };
1809  return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
1810}
1811
1812/// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has
1813/// any uses which require the SF or OF bits to be accurate.
1814static bool HasNoSignedComparisonUses(SDNode *N) {
1815  // Examine each user of the node.
1816  for (SDNode::use_iterator UI = N->use_begin(),
1817         UE = N->use_end(); UI != UE; ++UI) {
1818    // Only examine CopyToReg uses.
1819    if (UI->getOpcode() != ISD::CopyToReg)
1820      return false;
1821    // Only examine CopyToReg uses that copy to EFLAGS.
1822    if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() !=
1823          X86::EFLAGS)
1824      return false;
1825    // Examine each user of the CopyToReg use.
1826    for (SDNode::use_iterator FlagUI = UI->use_begin(),
1827           FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
1828      // Only examine the Flag result.
1829      if (FlagUI.getUse().getResNo() != 1) continue;
1830      // Anything unusual: assume conservatively.
1831      if (!FlagUI->isMachineOpcode()) return false;
1832      // Examine the opcode of the user.
1833      switch (FlagUI->getMachineOpcode()) {
1834      // These comparisons don't treat the most significant bit specially.
1835      case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr:
1836      case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr:
1837      case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm:
1838      case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm:
1839      case X86::JA_4: case X86::JAE_4: case X86::JB_4: case X86::JBE_4:
1840      case X86::JE_4: case X86::JNE_4: case X86::JP_4: case X86::JNP_4:
1841      case X86::CMOVA16rr: case X86::CMOVA16rm:
1842      case X86::CMOVA32rr: case X86::CMOVA32rm:
1843      case X86::CMOVA64rr: case X86::CMOVA64rm:
1844      case X86::CMOVAE16rr: case X86::CMOVAE16rm:
1845      case X86::CMOVAE32rr: case X86::CMOVAE32rm:
1846      case X86::CMOVAE64rr: case X86::CMOVAE64rm:
1847      case X86::CMOVB16rr: case X86::CMOVB16rm:
1848      case X86::CMOVB32rr: case X86::CMOVB32rm:
1849      case X86::CMOVB64rr: case X86::CMOVB64rm:
1850      case X86::CMOVBE16rr: case X86::CMOVBE16rm:
1851      case X86::CMOVBE32rr: case X86::CMOVBE32rm:
1852      case X86::CMOVBE64rr: case X86::CMOVBE64rm:
1853      case X86::CMOVE16rr: case X86::CMOVE16rm:
1854      case X86::CMOVE32rr: case X86::CMOVE32rm:
1855      case X86::CMOVE64rr: case X86::CMOVE64rm:
1856      case X86::CMOVNE16rr: case X86::CMOVNE16rm:
1857      case X86::CMOVNE32rr: case X86::CMOVNE32rm:
1858      case X86::CMOVNE64rr: case X86::CMOVNE64rm:
1859      case X86::CMOVNP16rr: case X86::CMOVNP16rm:
1860      case X86::CMOVNP32rr: case X86::CMOVNP32rm:
1861      case X86::CMOVNP64rr: case X86::CMOVNP64rm:
1862      case X86::CMOVP16rr: case X86::CMOVP16rm:
1863      case X86::CMOVP32rr: case X86::CMOVP32rm:
1864      case X86::CMOVP64rr: case X86::CMOVP64rm:
1865        continue;
1866      // Anything else: assume conservatively.
1867      default: return false;
1868      }
1869    }
1870  }
1871  return true;
1872}
1873
1874/// isLoadIncOrDecStore - Check whether or not the chain ending in StoreNode
1875/// is suitable for the {load; increment or decrement; store} read-modify-write
1876/// transformation.
1877static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
1878                                SDValue StoredVal, SelectionDAG *CurDAG,
1879                                LoadSDNode* &LoadNode, SDValue &InputChain) {
1880
1881  // is the value stored the result of a DEC or INC?
1882  if (!(Opc == X86ISD::DEC || Opc == X86ISD::INC)) return false;
1883
1884  // is the stored value result 0 of the load?
1885  if (StoredVal.getResNo() != 0) return false;
1886
1887  // are there other uses of the loaded value than the inc or dec?
1888  if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
1889
1890  // is the store non-extending and non-indexed?
1891  if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
1892    return false;
1893
1894  SDValue Load = StoredVal->getOperand(0);
1895  // Is the stored value a non-extending and non-indexed load?
1896  if (!ISD::isNormalLoad(Load.getNode())) return false;
1897
1898  // Return LoadNode by reference.
1899  LoadNode = cast<LoadSDNode>(Load);
1900  // is the size of the value one that we can handle? (i.e. 64, 32, 16, or 8)
1901  EVT LdVT = LoadNode->getMemoryVT();
1902  if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 &&
1903      LdVT != MVT::i8)
1904    return false;
1905
1906  // Is store the only read of the loaded value?
1907  if (!Load.hasOneUse())
1908    return false;
1909
1910  // Is the address of the store the same as the load?
1911  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
1912      LoadNode->getOffset() != StoreNode->getOffset())
1913    return false;
1914
1915  // Check if the chain is produced by the load or is a TokenFactor with
1916  // the load output chain as an operand. Return InputChain by reference.
1917  SDValue Chain = StoreNode->getChain();
1918
1919  bool ChainCheck = false;
1920  if (Chain == Load.getValue(1)) {
1921    ChainCheck = true;
1922    InputChain = LoadNode->getChain();
1923  } else if (Chain.getOpcode() == ISD::TokenFactor) {
1924    SmallVector<SDValue, 4> ChainOps;
1925    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
1926      SDValue Op = Chain.getOperand(i);
1927      if (Op == Load.getValue(1)) {
1928        ChainCheck = true;
1929        continue;
1930      }
1931
1932      // Make sure using Op as part of the chain would not cause a cycle here.
1933      // In theory, we could check whether the chain node is a predecessor of
1934      // the load. But that can be very expensive. Instead visit the uses and
1935      // make sure they all have smaller node id than the load.
1936      int LoadId = LoadNode->getNodeId();
1937      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
1938             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
1939        if (UI.getUse().getResNo() != 0)
1940          continue;
1941        if (UI->getNodeId() > LoadId)
1942          return false;
1943      }
1944
1945      ChainOps.push_back(Op);
1946    }
1947
1948    if (ChainCheck)
1949      // Make a new TokenFactor with all the other input chains except
1950      // for the load.
1951      InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
1952                                   MVT::Other, &ChainOps[0], ChainOps.size());
1953  }
1954  if (!ChainCheck)
1955    return false;
1956
1957  return true;
1958}
1959
1960/// getFusedLdStOpcode - Get the appropriate X86 opcode for an in memory
1961/// increment or decrement. Opc should be X86ISD::DEC or X86ISD::INC.
1962static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
1963  if (Opc == X86ISD::DEC) {
1964    if (LdVT == MVT::i64) return X86::DEC64m;
1965    if (LdVT == MVT::i32) return X86::DEC32m;
1966    if (LdVT == MVT::i16) return X86::DEC16m;
1967    if (LdVT == MVT::i8)  return X86::DEC8m;
1968  } else {
1969    assert(Opc == X86ISD::INC && "unrecognized opcode");
1970    if (LdVT == MVT::i64) return X86::INC64m;
1971    if (LdVT == MVT::i32) return X86::INC32m;
1972    if (LdVT == MVT::i16) return X86::INC16m;
1973    if (LdVT == MVT::i8)  return X86::INC8m;
1974  }
1975  llvm_unreachable("unrecognized size for LdVT");
1976}
1977
1978/// SelectGather - Customized ISel for GATHER operations.
1979///
1980SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) {
1981  // Operands of Gather: VSrc, Base, VIdx, VMask, Scale
1982  SDValue Chain = Node->getOperand(0);
1983  SDValue VSrc = Node->getOperand(2);
1984  SDValue Base = Node->getOperand(3);
1985  SDValue VIdx = Node->getOperand(4);
1986  SDValue VMask = Node->getOperand(5);
1987  ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
1988  if (!Scale)
1989    return 0;
1990
1991  SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(),
1992                                   MVT::Other);
1993
1994  // Memory Operands: Base, Scale, Index, Disp, Segment
1995  SDValue Disp = CurDAG->getTargetConstant(0, MVT::i32);
1996  SDValue Segment = CurDAG->getRegister(0, MVT::i32);
1997  const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx,
1998                          Disp, Segment, VMask, Chain};
1999  SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node), VTs, Ops);
2000  // Node has 2 outputs: VDst and MVT::Other.
2001  // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other.
2002  // We replace VDst of Node with VDst of ResNode, and Other of Node with Other
2003  // of ResNode.
2004  ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
2005  ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2));
2006  return ResNode;
2007}
2008
2009SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
2010  EVT NVT = Node->getValueType(0);
2011  unsigned Opc, MOpc;
2012  unsigned Opcode = Node->getOpcode();
2013  SDLoc dl(Node);
2014
2015  DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
2016
2017  if (Node->isMachineOpcode()) {
2018    DEBUG(dbgs() << "== ";  Node->dump(CurDAG); dbgs() << '\n');
2019    return NULL;   // Already selected.
2020  }
2021
2022  switch (Opcode) {
2023  default: break;
2024  case ISD::INTRINSIC_W_CHAIN: {
2025    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
2026    switch (IntNo) {
2027    default: break;
2028    case Intrinsic::x86_avx2_gather_d_pd:
2029    case Intrinsic::x86_avx2_gather_d_pd_256:
2030    case Intrinsic::x86_avx2_gather_q_pd:
2031    case Intrinsic::x86_avx2_gather_q_pd_256:
2032    case Intrinsic::x86_avx2_gather_d_ps:
2033    case Intrinsic::x86_avx2_gather_d_ps_256:
2034    case Intrinsic::x86_avx2_gather_q_ps:
2035    case Intrinsic::x86_avx2_gather_q_ps_256:
2036    case Intrinsic::x86_avx2_gather_d_q:
2037    case Intrinsic::x86_avx2_gather_d_q_256:
2038    case Intrinsic::x86_avx2_gather_q_q:
2039    case Intrinsic::x86_avx2_gather_q_q_256:
2040    case Intrinsic::x86_avx2_gather_d_d:
2041    case Intrinsic::x86_avx2_gather_d_d_256:
2042    case Intrinsic::x86_avx2_gather_q_d:
2043    case Intrinsic::x86_avx2_gather_q_d_256: {
2044      if (!Subtarget->hasAVX2())
2045        break;
2046      unsigned Opc;
2047      switch (IntNo) {
2048      default: llvm_unreachable("Impossible intrinsic");
2049      case Intrinsic::x86_avx2_gather_d_pd:     Opc = X86::VGATHERDPDrm;  break;
2050      case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break;
2051      case Intrinsic::x86_avx2_gather_q_pd:     Opc = X86::VGATHERQPDrm;  break;
2052      case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break;
2053      case Intrinsic::x86_avx2_gather_d_ps:     Opc = X86::VGATHERDPSrm;  break;
2054      case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break;
2055      case Intrinsic::x86_avx2_gather_q_ps:     Opc = X86::VGATHERQPSrm;  break;
2056      case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break;
2057      case Intrinsic::x86_avx2_gather_d_q:      Opc = X86::VPGATHERDQrm;  break;
2058      case Intrinsic::x86_avx2_gather_d_q_256:  Opc = X86::VPGATHERDQYrm; break;
2059      case Intrinsic::x86_avx2_gather_q_q:      Opc = X86::VPGATHERQQrm;  break;
2060      case Intrinsic::x86_avx2_gather_q_q_256:  Opc = X86::VPGATHERQQYrm; break;
2061      case Intrinsic::x86_avx2_gather_d_d:      Opc = X86::VPGATHERDDrm;  break;
2062      case Intrinsic::x86_avx2_gather_d_d_256:  Opc = X86::VPGATHERDDYrm; break;
2063      case Intrinsic::x86_avx2_gather_q_d:      Opc = X86::VPGATHERQDrm;  break;
2064      case Intrinsic::x86_avx2_gather_q_d_256:  Opc = X86::VPGATHERQDYrm; break;
2065      }
2066      SDNode *RetVal = SelectGather(Node, Opc);
2067      if (RetVal)
2068        // We already called ReplaceUses inside SelectGather.
2069        return NULL;
2070      break;
2071    }
2072    }
2073    break;
2074  }
2075  case X86ISD::GlobalBaseReg:
2076    return getGlobalBaseReg();
2077
2078
2079  case X86ISD::ATOMOR64_DAG:
2080  case X86ISD::ATOMXOR64_DAG:
2081  case X86ISD::ATOMADD64_DAG:
2082  case X86ISD::ATOMSUB64_DAG:
2083  case X86ISD::ATOMNAND64_DAG:
2084  case X86ISD::ATOMAND64_DAG:
2085  case X86ISD::ATOMMAX64_DAG:
2086  case X86ISD::ATOMMIN64_DAG:
2087  case X86ISD::ATOMUMAX64_DAG:
2088  case X86ISD::ATOMUMIN64_DAG:
2089  case X86ISD::ATOMSWAP64_DAG: {
2090    unsigned Opc;
2091    switch (Opcode) {
2092    default: llvm_unreachable("Impossible opcode");
2093    case X86ISD::ATOMOR64_DAG:   Opc = X86::ATOMOR6432;   break;
2094    case X86ISD::ATOMXOR64_DAG:  Opc = X86::ATOMXOR6432;  break;
2095    case X86ISD::ATOMADD64_DAG:  Opc = X86::ATOMADD6432;  break;
2096    case X86ISD::ATOMSUB64_DAG:  Opc = X86::ATOMSUB6432;  break;
2097    case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break;
2098    case X86ISD::ATOMAND64_DAG:  Opc = X86::ATOMAND6432;  break;
2099    case X86ISD::ATOMMAX64_DAG:  Opc = X86::ATOMMAX6432;  break;
2100    case X86ISD::ATOMMIN64_DAG:  Opc = X86::ATOMMIN6432;  break;
2101    case X86ISD::ATOMUMAX64_DAG: Opc = X86::ATOMUMAX6432; break;
2102    case X86ISD::ATOMUMIN64_DAG: Opc = X86::ATOMUMIN6432; break;
2103    case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break;
2104    }
2105    SDNode *RetVal = SelectAtomic64(Node, Opc);
2106    if (RetVal)
2107      return RetVal;
2108    break;
2109  }
2110
2111  case ISD::ATOMIC_LOAD_XOR:
2112  case ISD::ATOMIC_LOAD_AND:
2113  case ISD::ATOMIC_LOAD_OR:
2114  case ISD::ATOMIC_LOAD_ADD: {
2115    SDNode *RetVal = SelectAtomicLoadArith(Node, NVT);
2116    if (RetVal)
2117      return RetVal;
2118    break;
2119  }
2120  case ISD::AND:
2121  case ISD::OR:
2122  case ISD::XOR: {
2123    // For operations of the form (x << C1) op C2, check if we can use a smaller
2124    // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
2125    SDValue N0 = Node->getOperand(0);
2126    SDValue N1 = Node->getOperand(1);
2127
2128    if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse())
2129      break;
2130
2131    // i8 is unshrinkable, i16 should be promoted to i32.
2132    if (NVT != MVT::i32 && NVT != MVT::i64)
2133      break;
2134
2135    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
2136    ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
2137    if (!Cst || !ShlCst)
2138      break;
2139
2140    int64_t Val = Cst->getSExtValue();
2141    uint64_t ShlVal = ShlCst->getZExtValue();
2142
2143    // Make sure that we don't change the operation by removing bits.
2144    // This only matters for OR and XOR; AND is unaffected.
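    // For example, ((x << 8) | 0x180) cannot become ((x | 0x1) << 8), because
    // bit 7 of the constant would be dropped.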
2145    uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
2146    if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
2147      break;
2148
2149    unsigned ShlOp, Op;
2150    EVT CstVT = NVT;
2151
2152    // Check the minimum bitwidth for the new constant.
2153    // TODO: AND32ri is the same as AND64ri32 with zext imm.
2154    // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr
2155    // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
2156    if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal))
2157      CstVT = MVT::i8;
2158    else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal))
2159      CstVT = MVT::i32;
2160
2161    // Bail if there is no smaller encoding.
2162    if (NVT == CstVT)
2163      break;
2164
2165    switch (NVT.getSimpleVT().SimpleTy) {
2166    default: llvm_unreachable("Unsupported VT!");
2167    case MVT::i32:
2168      assert(CstVT == MVT::i8);
2169      ShlOp = X86::SHL32ri;
2170
2171      switch (Opcode) {
2172      default: llvm_unreachable("Impossible opcode");
2173      case ISD::AND: Op = X86::AND32ri8; break;
2174      case ISD::OR:  Op =  X86::OR32ri8; break;
2175      case ISD::XOR: Op = X86::XOR32ri8; break;
2176      }
2177      break;
2178    case MVT::i64:
2179      assert(CstVT == MVT::i8 || CstVT == MVT::i32);
2180      ShlOp = X86::SHL64ri;
2181
2182      switch (Opcode) {
2183      default: llvm_unreachable("Impossible opcode");
2184      case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
2185      case ISD::OR:  Op = CstVT==MVT::i8?  X86::OR64ri8 :  X86::OR64ri32; break;
2186      case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
2187      }
2188      break;
2189    }
2190
2191    // Emit the smaller op and the shift.
2192    SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, CstVT);
2193    SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
2194    return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
2195                                getI8Imm(ShlVal));
2196  }
2197  case X86ISD::UMUL: {
2198    SDValue N0 = Node->getOperand(0);
2199    SDValue N1 = Node->getOperand(1);
2200
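    // The one-operand MUL reads its other source implicitly from the
    // accumulator, so copy N0 into AL/AX/EAX/RAX first; the selected machine
    // node's three results (the two product halves and EFLAGS) replace the
    // UMUL's results below.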
2201    unsigned LoReg;
2202    switch (NVT.getSimpleVT().SimpleTy) {
2203    default: llvm_unreachable("Unsupported VT!");
2204    case MVT::i8:  LoReg = X86::AL;  Opc = X86::MUL8r; break;
2205    case MVT::i16: LoReg = X86::AX;  Opc = X86::MUL16r; break;
2206    case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break;
2207    case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break;
2208    }
2209
2210    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
2211                                          N0, SDValue()).getValue(1);
2212
2213    SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
2214    SDValue Ops[] = {N1, InFlag};
2215    SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
2216
2217    ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
2218    ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1));
2219    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 2));
2220    return NULL;
2221  }
2222
2223  case ISD::SMUL_LOHI:
2224  case ISD::UMUL_LOHI: {
2225    SDValue N0 = Node->getOperand(0);
2226    SDValue N1 = Node->getOperand(1);
2227
2228    bool isSigned = Opcode == ISD::SMUL_LOHI;
2229    bool hasBMI2 = Subtarget->hasBMI2();
2230    if (!isSigned) {
2231      switch (NVT.getSimpleVT().SimpleTy) {
2232      default: llvm_unreachable("Unsupported VT!");
2233      case MVT::i8:  Opc = X86::MUL8r;  MOpc = X86::MUL8m;  break;
2234      case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
2235      case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r;
2236                     MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break;
2237      case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r;
2238                     MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break;
2239      }
2240    } else {
2241      switch (NVT.getSimpleVT().SimpleTy) {
2242      default: llvm_unreachable("Unsupported VT!");
2243      case MVT::i8:  Opc = X86::IMUL8r;  MOpc = X86::IMUL8m;  break;
2244      case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
2245      case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
2246      case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
2247      }
2248    }
2249
2250    unsigned SrcReg, LoReg, HiReg;
2251    switch (Opc) {
2252    default: llvm_unreachable("Unknown MUL opcode!");
2253    case X86::IMUL8r:
2254    case X86::MUL8r:
2255      SrcReg = LoReg = X86::AL; HiReg = X86::AH;
2256      break;
2257    case X86::IMUL16r:
2258    case X86::MUL16r:
2259      SrcReg = LoReg = X86::AX; HiReg = X86::DX;
2260      break;
2261    case X86::IMUL32r:
2262    case X86::MUL32r:
2263      SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
2264      break;
2265    case X86::IMUL64r:
2266    case X86::MUL64r:
2267      SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
2268      break;
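    // MULX (BMI2) reads its implicit source from EDX/RDX and writes both
    // product halves to explicit destination registers, so there are no fixed
    // result registers to copy out of afterwards.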
2269    case X86::MULX32rr:
2270      SrcReg = X86::EDX; LoReg = HiReg = 0;
2271      break;
2272    case X86::MULX64rr:
2273      SrcReg = X86::RDX; LoReg = HiReg = 0;
2274      break;
2275    }
2276
2277    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
2278    bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
2279    // Multiply is commutative.
2280    if (!foldedLoad) {
2281      foldedLoad = TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
2282      if (foldedLoad)
2283        std::swap(N0, N1);
2284    }
2285
2286    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
2287                                          N0, SDValue()).getValue(1);
2288    SDValue ResHi, ResLo;
2289
2290    if (foldedLoad) {
2291      SDValue Chain;
2292      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
2293                        InFlag };
2294      if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) {
2295        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue);
2296        SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
2297        ResHi = SDValue(CNode, 0);
2298        ResLo = SDValue(CNode, 1);
2299        Chain = SDValue(CNode, 2);
2300        InFlag = SDValue(CNode, 3);
2301      } else {
2302        SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2303        SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
2304        Chain = SDValue(CNode, 0);
2305        InFlag = SDValue(CNode, 1);
2306      }
2307
2308      // Update the chain.
2309      ReplaceUses(N1.getValue(1), Chain);
2310    } else {
2311      SDValue Ops[] = { N1, InFlag };
2312      if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) {
2313        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue);
2314        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
2315        ResHi = SDValue(CNode, 0);
2316        ResLo = SDValue(CNode, 1);
2317        InFlag = SDValue(CNode, 2);
2318      } else {
2319        SDVTList VTs = CurDAG->getVTList(MVT::Glue);
2320        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
2321        InFlag = SDValue(CNode, 0);
2322      }
2323    }
2324
2325    // Prevent use of AH in a REX instruction by referencing AX instead.
2326    if (HiReg == X86::AH && Subtarget->is64Bit() &&
2327        !SDValue(Node, 1).use_empty()) {
2328      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
2329                                              X86::AX, MVT::i16, InFlag);
2330      InFlag = Result.getValue(2);
2331      // Get the low part if needed. Don't use getCopyFromReg for aliasing
2332      // registers.
2333      if (!SDValue(Node, 0).use_empty())
2334        ReplaceUses(SDValue(Node, 0),
2335          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
2336
2337      // Shift AX down 8 bits.
2338      Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
2339                                              Result,
2340                                     CurDAG->getTargetConstant(8, MVT::i8)), 0);
2341      // Then truncate it down to i8.
2342      ReplaceUses(SDValue(Node, 1),
2343        CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
2344    }
2345    // Copy the low half of the result, if it is needed.
2346    if (!SDValue(Node, 0).use_empty()) {
2347      if (ResLo.getNode() == 0) {
2348        assert(LoReg && "Register for low half is not defined!");
2349        ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT,
2350                                       InFlag);
2351        InFlag = ResLo.getValue(2);
2352      }
2353      ReplaceUses(SDValue(Node, 0), ResLo);
2354      DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n');
2355    }
2356    // Copy the high half of the result, if it is needed.
2357    if (!SDValue(Node, 1).use_empty()) {
2358      if (ResHi.getNode() == 0) {
2359        assert(HiReg && "Register for high half is not defined!");
2360        ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT,
2361                                       InFlag);
2362        InFlag = ResHi.getValue(2);
2363      }
2364      ReplaceUses(SDValue(Node, 1), ResHi);
2365      DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
2366    }
2367
2368    return NULL;
2369  }
2370
2371  case ISD::SDIVREM:
2372  case ISD::UDIVREM: {
2373    SDValue N0 = Node->getOperand(0);
2374    SDValue N1 = Node->getOperand(1);
2375
2376    bool isSigned = Opcode == ISD::SDIVREM;
2377    if (!isSigned) {
2378      switch (NVT.getSimpleVT().SimpleTy) {
2379      default: llvm_unreachable("Unsupported VT!");
2380      case MVT::i8:  Opc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
2381      case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
2382      case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
2383      case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
2384      }
2385    } else {
2386      switch (NVT.getSimpleVT().SimpleTy) {
2387      default: llvm_unreachable("Unsupported VT!");
2388      case MVT::i8:  Opc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
2389      case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
2390      case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
2391      case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
2392      }
2393    }
2394
2395    unsigned LoReg, HiReg, ClrReg;
2396    unsigned SExtOpcode;
2397    switch (NVT.getSimpleVT().SimpleTy) {
2398    default: llvm_unreachable("Unsupported VT!");
2399    case MVT::i8:
2400      LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
2401      SExtOpcode = X86::CBW;
2402      break;
2403    case MVT::i16:
2404      LoReg = X86::AX;  HiReg = X86::DX;
2405      ClrReg = X86::DX;
2406      SExtOpcode = X86::CWD;
2407      break;
2408    case MVT::i32:
2409      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
2410      SExtOpcode = X86::CDQ;
2411      break;
2412    case MVT::i64:
2413      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
2414      SExtOpcode = X86::CQO;
2415      break;
2416    }
2417
2418    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
2419    bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
2420    bool signBitIsZero = CurDAG->SignBitIsZero(N0);
2421
2422    SDValue InFlag;
2423    if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
2424      // Special case for div8, just use a move with zero extension to AX to
2425      // clear the upper 8 bits (AH).
2426      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain;
2427      if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
2428        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
2429        Move =
2430          SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
2431                                         MVT::Other, Ops), 0);
2432        Chain = Move.getValue(1);
2433        ReplaceUses(N0.getValue(1), Chain);
2434      } else {
2435        Move =
2436          SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0);
2437        Chain = CurDAG->getEntryNode();
2438      }
2439      Chain  = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue());
2440      InFlag = Chain.getValue(1);
2441    } else {
2442      InFlag =
2443        CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
2444                             LoReg, N0, SDValue()).getValue(1);
2445      if (isSigned && !signBitIsZero) {
2446        // Sign extend the low part into the high part.
2447        InFlag =
2448          SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
2449      } else {
2450        // Zero out the high part, effectively zero extending the input.
2451        SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
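        // Materialize a zero and adjust it to the width of the high-half
        // register (ClrReg): extract the 16-bit subregister for i16, or widen
        // it with SUBREG_TO_REG for i64.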
2452        switch (NVT.getSimpleVT().SimpleTy) {
2453        case MVT::i16:
2454          ClrNode =
2455              SDValue(CurDAG->getMachineNode(
2456                          TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
2457                          CurDAG->getTargetConstant(X86::sub_16bit, MVT::i32)),
2458                      0);
2459          break;
2460        case MVT::i32:
2461          break;
2462        case MVT::i64:
2463          ClrNode =
2464              SDValue(CurDAG->getMachineNode(
2465                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
2466                          CurDAG->getTargetConstant(0, MVT::i64), ClrNode,
2467                          CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)),
2468                      0);
2469          break;
2470        default:
2471          llvm_unreachable("Unexpected division source");
2472        }
2473
2474        InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
2475                                      ClrNode, InFlag).getValue(1);
2476      }
2477    }
2478
2479    if (foldedLoad) {
2480      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
2481                        InFlag };
2482      SDNode *CNode =
2483        CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
2484      InFlag = SDValue(CNode, 1);
2485      // Update the chain.
2486      ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
2487    } else {
2488      InFlag =
2489        SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
2490    }
2491
2492    // Prevent use of AH in a REX instruction by referencing AX instead.
2493    // Shift it down 8 bits.
2494    if (HiReg == X86::AH && Subtarget->is64Bit() &&
2495        !SDValue(Node, 1).use_empty()) {
2496      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
2497                                              X86::AX, MVT::i16, InFlag);
2498      InFlag = Result.getValue(2);
2499
2500      // If we also need AL (the quotient), get it by extracting a subreg from
2501      // Result. The fast register allocator does not like multiple CopyFromReg
2502      // nodes using aliasing registers.
2503      if (!SDValue(Node, 0).use_empty())
2504        ReplaceUses(SDValue(Node, 0),
2505          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
2506
2507      // Shift AX right by 8 bits instead of using AH.
2508      Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
2509                                         Result,
2510                                         CurDAG->getTargetConstant(8, MVT::i8)),
2511                       0);
2512      ReplaceUses(SDValue(Node, 1),
2513        CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
2514    }
2515    // Copy the division (low) result, if it is needed.
2516    if (!SDValue(Node, 0).use_empty()) {
2517      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
2518                                                LoReg, NVT, InFlag);
2519      InFlag = Result.getValue(2);
2520      ReplaceUses(SDValue(Node, 0), Result);
2521      DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
2522    }
2523    // Copy the remainder (high) result, if it is needed.
2524    if (!SDValue(Node, 1).use_empty()) {
2525      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
2526                                              HiReg, NVT, InFlag);
2527      InFlag = Result.getValue(2);
2528      ReplaceUses(SDValue(Node, 1), Result);
2529      DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
2530    }
2531    return NULL;
2532  }
2533
2534  case X86ISD::CMP:
2535  case X86ISD::SUB: {
2536    // Sometimes a SUB is used to perform a comparison.
2537    if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0))
2538      // This node is not a CMP.
2539      break;
2540    SDValue N0 = Node->getOperand(0);
2541    SDValue N1 = Node->getOperand(1);
2542
2543    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
2544    // use a smaller encoding.
2545    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
2546        HasNoSignedComparisonUses(Node))
2547      // Look past the truncate if CMP is the only use of it.
2548      N0 = N0.getOperand(0);
2549    if ((N0.getNode()->getOpcode() == ISD::AND ||
2550         (N0.getResNo() == 0 && N0.getNode()->getOpcode() == X86ISD::AND)) &&
2551        N0.getNode()->hasOneUse() &&
2552        N0.getValueType() != MVT::i8 &&
2553        X86::isZeroNode(N1)) {
2554      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1));
2555      if (!C) break;
2556
2557      // For example, convert "testl %eax, $8" to "testb %al, $8"
2558      if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 &&
2559          (!(C->getZExtValue() & 0x80) ||
2560           HasNoSignedComparisonUses(Node))) {
2561        SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i8);
2562        SDValue Reg = N0.getNode()->getOperand(0);
2563
2564        // On x86-32, only the ABCD registers have 8-bit subregisters.
2565        if (!Subtarget->is64Bit()) {
2566          const TargetRegisterClass *TRC;
2567          switch (N0.getValueType().getSimpleVT().SimpleTy) {
2568          case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
2569          case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
2570          default: llvm_unreachable("Unsupported TEST operand type!");
2571          }
2572          SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32);
2573          Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
2574                                               Reg.getValueType(), Reg, RC), 0);
2575        }
2576
2577        // Extract the l-register.
2578        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl,
2579                                                        MVT::i8, Reg);
2580
2581        // Emit a testb.
2582        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
2583                                                 Subreg, Imm);
2584        // Replace SUB|CMP with TEST; since SUB has two outputs while TEST has
2585        // only one, do not call ReplaceAllUsesWith.
2586        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
2587                    SDValue(NewNode, 0));
2588        return NULL;
2589      }
2590
2591      // For example, "testl %eax, $2048" to "testb %ah, $8".
2592      if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 &&
2593          (!(C->getZExtValue() & 0x8000) ||
2594           HasNoSignedComparisonUses(Node))) {
2595        // Shift the immediate right by 8 bits.
2596        SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
2597                                                       MVT::i8);
2598        SDValue Reg = N0.getNode()->getOperand(0);
2599
2600        // Put the value in an ABCD register.
2601        const TargetRegisterClass *TRC;
2602        switch (N0.getValueType().getSimpleVT().SimpleTy) {
2603        case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break;
2604        case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
2605        case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
2606        default: llvm_unreachable("Unsupported TEST operand type!");
2607        }
2608        SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32);
2609        Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
2610                                             Reg.getValueType(), Reg, RC), 0);
2611
2612        // Extract the h-register.
2613        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
2614                                                        MVT::i8, Reg);
2615
2616        // Emit a testb.  The EXTRACT_SUBREG becomes a COPY that can only
2617        // target GR8_NOREX registers, so make sure the register class is
2618        // forced.
2619        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl,
2620                                                 MVT::i32, Subreg, ShiftedImm);
2621        // Replace SUB|CMP with TEST; since SUB has two outputs while TEST has
2622        // only one, do not call ReplaceAllUsesWith.
2623        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
2624                    SDValue(NewNode, 0));
2625        return NULL;
2626      }
2627
2628      // For example, "testl %eax, $32776" to "testw %ax, $32776".
2629      if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 &&
2630          N0.getValueType() != MVT::i16 &&
2631          (!(C->getZExtValue() & 0x8000) ||
2632           HasNoSignedComparisonUses(Node))) {
2633        SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i16);
2634        SDValue Reg = N0.getNode()->getOperand(0);
2635
2636        // Extract the 16-bit subregister.
2637        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
2638                                                        MVT::i16, Reg);
2639
2640        // Emit a testw.
2641        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32,
2642                                                 Subreg, Imm);
2643        // Replace SUB|CMP with TEST; since SUB has two outputs while TEST has
2644        // only one, do not call ReplaceAllUsesWith.
2645        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
2646                    SDValue(NewNode, 0));
2647        return NULL;
2648      }
2649
2650      // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
2651      if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 &&
2652          N0.getValueType() == MVT::i64 &&
2653          (!(C->getZExtValue() & 0x80000000) ||
2654           HasNoSignedComparisonUses(Node))) {
2655        SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
2656        SDValue Reg = N0.getNode()->getOperand(0);
2657
2658        // Extract the 32-bit subregister.
2659        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
2660                                                        MVT::i32, Reg);
2661
2662        // Emit a testl.
2663        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32,
2664                                                 Subreg, Imm);
2665        // Replace SUB|CMP with TEST; since SUB has two outputs while TEST has
2666        // only one, do not call ReplaceAllUsesWith.
2667        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
2668                    SDValue(NewNode, 0));
2669        return NULL;
2670      }
2671    }
2672    break;
2673  }
2674  case ISD::STORE: {
2675    // Change a chain of {load; incr or dec; store} of the same value into
2676    // a simple increment or decrement through memory of that value, if the
2677    // uses of the modified value and its address are suitable.
2678    // The DEC64m tablegen pattern is currently not able to match the case where
2679    // the EFLAGS on the original DEC are used. (This also applies to
2680    // {INC,DEC}X{64,32,16,8}.)
2681    // We'll need to improve tablegen to allow flags to be transferred from a
2682    // node in the pattern to the result node, probably with a new keyword.
2683    // For example, we have this:
2684    // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2685    //  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2686    //   (implicit EFLAGS)]>;
2687    // but maybe we need something like this:
2688    // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2689    //  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2690    //   (transferrable EFLAGS)]>;
2691
2692    StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
2693    SDValue StoredVal = StoreNode->getOperand(1);
2694    unsigned Opc = StoredVal->getOpcode();
2695
2696    LoadSDNode *LoadNode = 0;
2697    SDValue InputChain;
2698    if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG,
2699                             LoadNode, InputChain))
2700      break;
2701
2702    SDValue Base, Scale, Index, Disp, Segment;
2703    if (!SelectAddr(LoadNode, LoadNode->getBasePtr(),
2704                    Base, Scale, Index, Disp, Segment))
2705      break;
2706
2707    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2);
2708    MemOp[0] = StoreNode->getMemOperand();
2709    MemOp[1] = LoadNode->getMemOperand();
2710    const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain };
2711    EVT LdVT = LoadNode->getMemoryVT();
2712    unsigned newOpc = getFusedLdStOpcode(LdVT, Opc);
2713    MachineSDNode *Result = CurDAG->getMachineNode(newOpc,
2714                                                   SDLoc(Node),
2715                                                   MVT::i32, MVT::Other, Ops);
2716    Result->setMemRefs(MemOp, MemOp + 2);
2717
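    // Result 0 of the fused instruction is EFLAGS, standing in for the flag
    // result of the original INC/DEC; result 1 is the chain, which replaces
    // the store's chain output.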
2718    ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
2719    ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
2720
2721    return Result;
2722  }
2723  }
2724
2725  SDNode *ResNode = SelectCode(Node);
2726
2727  DEBUG(dbgs() << "=> ";
2728        if (ResNode == NULL || ResNode == Node)
2729          Node->dump(CurDAG);
2730        else
2731          ResNode->dump(CurDAG);
2732        dbgs() << '\n');
2733
2734  return ResNode;
2735}
2736
2737bool X86DAGToDAGISel::
2738SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
2739                             std::vector<SDValue> &OutOps) {
2740  SDValue Op0, Op1, Op2, Op3, Op4;
2741  switch (ConstraintCode) {
2742  case 'o':   // offsetable        ??
2743  case 'v':   // not offsetable    ??
2744  default: return true;
2745  case 'm':   // memory
2746    if (!SelectAddr(0, Op, Op0, Op1, Op2, Op3, Op4))
2747      return true;
2748    break;
2749  }
2750
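  // SelectAddr produced the five memory operands in the usual order:
  // base, scale, index, displacement and segment.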
2751  OutOps.push_back(Op0);
2752  OutOps.push_back(Op1);
2753  OutOps.push_back(Op2);
2754  OutOps.push_back(Op3);
2755  OutOps.push_back(Op4);
2756  return false;
2757}
2758
2759/// createX86ISelDag - This pass converts a legalized DAG into a
2760/// X86-specific DAG, ready for instruction scheduling.
2761///
2762FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
2763                                     CodeGenOpt::Level OptLevel) {
2764  return new X86DAGToDAGISel(TM, OptLevel);
2765}
2766