X86ISelLowering.cpp revision 0e6d230abdbf6ba67a2676c118431a4df8fb15dd
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "x86-isel"
16#include "X86.h"
17#include "X86InstrBuilder.h"
18#include "X86ISelLowering.h"
19#include "X86TargetMachine.h"
20#include "X86TargetObjectFile.h"
21#include "Utils/X86ShuffleDecode.h"
22#include "llvm/CallingConv.h"
23#include "llvm/Constants.h"
24#include "llvm/DerivedTypes.h"
25#include "llvm/GlobalAlias.h"
26#include "llvm/GlobalVariable.h"
27#include "llvm/Function.h"
28#include "llvm/Instructions.h"
29#include "llvm/Intrinsics.h"
30#include "llvm/LLVMContext.h"
31#include "llvm/CodeGen/IntrinsicLowering.h"
32#include "llvm/CodeGen/MachineFrameInfo.h"
33#include "llvm/CodeGen/MachineFunction.h"
34#include "llvm/CodeGen/MachineInstrBuilder.h"
35#include "llvm/CodeGen/MachineJumpTableInfo.h"
36#include "llvm/CodeGen/MachineModuleInfo.h"
37#include "llvm/CodeGen/MachineRegisterInfo.h"
38#include "llvm/CodeGen/PseudoSourceValue.h"
39#include "llvm/MC/MCAsmInfo.h"
40#include "llvm/MC/MCContext.h"
41#include "llvm/MC/MCExpr.h"
42#include "llvm/MC/MCSymbol.h"
43#include "llvm/ADT/BitVector.h"
44#include "llvm/ADT/SmallSet.h"
45#include "llvm/ADT/Statistic.h"
46#include "llvm/ADT/StringExtras.h"
47#include "llvm/ADT/VectorExtras.h"
48#include "llvm/Support/CallSite.h"
49#include "llvm/Support/Debug.h"
50#include "llvm/Support/Dwarf.h"
51#include "llvm/Support/ErrorHandling.h"
52#include "llvm/Support/MathExtras.h"
53#include "llvm/Support/raw_ostream.h"
54using namespace llvm;
55using namespace dwarf;
56
57STATISTIC(NumTailCalls, "Number of tail calls");
58
59// Forward declarations.
60static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
61                       SDValue V2);
62
63static SDValue Insert128BitVector(SDValue Result,
64                                  SDValue Vec,
65                                  SDValue Idx,
66                                  SelectionDAG &DAG,
67                                  DebugLoc dl);
68
69static SDValue Extract128BitVector(SDValue Vec,
70                                   SDValue Idx,
71                                   SelectionDAG &DAG,
72                                   DebugLoc dl);
73
74/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
75/// sets things up to match to an AVX VEXTRACTF128 instruction or a
76/// simple subregister reference.  Idx is an index in the 128 bits we
77/// want.  It need not be aligned to a 128-bit bounday.  That makes
78/// lowering EXTRACT_VECTOR_ELT operations easier.
79static SDValue Extract128BitVector(SDValue Vec,
80                                   SDValue Idx,
81                                   SelectionDAG &DAG,
82                                   DebugLoc dl) {
83  EVT VT = Vec.getValueType();
84  assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
85  EVT ElVT = VT.getVectorElementType();
86  int Factor = VT.getSizeInBits()/128;
87  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
88                                  VT.getVectorNumElements()/Factor);
89
90  // Extract from UNDEF is UNDEF.
91  if (Vec.getOpcode() == ISD::UNDEF)
92    return DAG.getNode(ISD::UNDEF, dl, ResultVT);
93
94  if (isa<ConstantSDNode>(Idx)) {
95    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
96
97    // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
98    // we can match to VEXTRACTF128.
99    unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
100
101    // This is the index of the first element of the 128-bit chunk
102    // we want.
103    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
104                                 * ElemsPerChunk);
105
106    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
107    SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
108                                 VecIdx);
109
110    return Result;
111  }
112
113  return SDValue();
114}
115
116/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
117/// sets things up to match to an AVX VINSERTF128 instruction or a
118/// simple superregister reference.  Idx is an index in the 128 bits
119/// we want.  It need not be aligned to a 128-bit bounday.  That makes
120/// lowering INSERT_VECTOR_ELT operations easier.
121static SDValue Insert128BitVector(SDValue Result,
122                                  SDValue Vec,
123                                  SDValue Idx,
124                                  SelectionDAG &DAG,
125                                  DebugLoc dl) {
126  if (isa<ConstantSDNode>(Idx)) {
127    EVT VT = Vec.getValueType();
128    assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");
129
130    EVT ElVT = VT.getVectorElementType();
131    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
132    EVT ResultVT = Result.getValueType();
133
134    // Insert the relevant 128 bits.
135    unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
136
137    // This is the index of the first element of the 128-bit chunk
138    // we want.
139    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
140                                 * ElemsPerChunk);
141
142    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
143    Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
144                         VecIdx);
145    return Result;
146  }
147
148  return SDValue();
149}
150
151static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
152  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
153  bool is64Bit = Subtarget->is64Bit();
154
155  if (Subtarget->isTargetEnvMacho()) {
156    if (is64Bit)
157      return new X8664_MachoTargetObjectFile();
158    return new TargetLoweringObjectFileMachO();
159  }
160
161  if (Subtarget->isTargetELF())
162    return new TargetLoweringObjectFileELF();
163  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
164    return new TargetLoweringObjectFileCOFF();
165  llvm_unreachable("unknown subtarget type");
166}
167
168X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
169  : TargetLowering(TM, createTLOF(TM)) {
170  Subtarget = &TM.getSubtarget<X86Subtarget>();
171  X86ScalarSSEf64 = Subtarget->hasXMMInt() || Subtarget->hasAVX();
172  X86ScalarSSEf32 = Subtarget->hasXMM() || Subtarget->hasAVX();
173  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
174
175  RegInfo = TM.getRegisterInfo();
176  TD = getTargetData();
177
178  // Set up the TargetLowering object.
179  static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
180
181  // X86 is weird, it always uses i8 for shift amounts and setcc results.
182  setBooleanContents(ZeroOrOneBooleanContent);
183
184  // For 64-bit since we have so many registers use the ILP scheduler, for
185  // 32-bit code use the register pressure specific scheduling.
186  if (Subtarget->is64Bit())
187    setSchedulingPreference(Sched::ILP);
188  else
189    setSchedulingPreference(Sched::RegPressure);
190  setStackPointerRegisterToSaveRestore(X86StackPtr);
191
192  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
193    // Setup Windows compiler runtime calls.
194    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
195    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
196    setLibcallName(RTLIB::SREM_I64, "_allrem");
197    setLibcallName(RTLIB::UREM_I64, "_aullrem");
198    setLibcallName(RTLIB::MUL_I64, "_allmul");
199    setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2");
200    setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2");
201    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
202    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
203    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
204    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
205    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
206    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C);
207    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C);
208  }
209
210  if (Subtarget->isTargetDarwin()) {
211    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
212    setUseUnderscoreSetJmp(false);
213    setUseUnderscoreLongJmp(false);
214  } else if (Subtarget->isTargetMingw()) {
215    // MS runtime is weird: it exports _setjmp, but longjmp!
216    setUseUnderscoreSetJmp(true);
217    setUseUnderscoreLongJmp(false);
218  } else {
219    setUseUnderscoreSetJmp(true);
220    setUseUnderscoreLongJmp(true);
221  }
222
223  // Set up the register classes.
224  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
225  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
226  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
227  if (Subtarget->is64Bit())
228    addRegisterClass(MVT::i64, X86::GR64RegisterClass);
229
230  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
231
232  // We don't accept any truncstore of integer registers.
233  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
234  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
235  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
236  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
237  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
238  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
239
240  // SETOEQ and SETUNE require checking two conditions.
241  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
242  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
243  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
244  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
245  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
246  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
247
248  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
249  // operation.
250  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
251  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
252  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
253
254  if (Subtarget->is64Bit()) {
255    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
256    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
257  } else if (!UseSoftFloat) {
258    // We have an algorithm for SSE2->double, and we turn this into a
259    // 64-bit FILD followed by conditional FADD for other targets.
260    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
261    // We have an algorithm for SSE2, and we turn this into a 64-bit
262    // FILD for other targets.
263    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
264  }
265
266  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
267  // this operation.
268  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
269  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
270
271  if (!UseSoftFloat) {
272    // SSE has no i16 to fp conversion, only i32
273    if (X86ScalarSSEf32) {
274      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
275      // f32 and f64 cases are Legal, f80 case is not
276      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
277    } else {
278      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
279      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
280    }
281  } else {
282    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
283    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
284  }
285
286  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
287  // are Legal, f80 is custom lowered.
288  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
289  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
290
291  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
292  // this operation.
293  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
294  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
295
296  if (X86ScalarSSEf32) {
297    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
298    // f32 and f64 cases are Legal, f80 case is not
299    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
300  } else {
301    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
302    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
303  }
304
305  // Handle FP_TO_UINT by promoting the destination to a larger signed
306  // conversion.
307  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
308  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
309  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
310
311  if (Subtarget->is64Bit()) {
312    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
313    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
314  } else if (!UseSoftFloat) {
315    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
316      // Expand FP_TO_UINT into a select.
317      // FIXME: We would like to use a Custom expander here eventually to do
318      // the optimal thing for SSE vs. the default expansion in the legalizer.
319      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
320    else
321      // With SSE3 we can use fisttpll to convert to a signed i64; without
322      // SSE, we're stuck with a fistpll.
323      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
324  }
325
326  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
327  if (!X86ScalarSSEf64) {
328    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
329    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
330    if (Subtarget->is64Bit()) {
331      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
332      // Without SSE, i64->f64 goes through memory.
333      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
334    }
335  }
336
337  // Scalar integer divide and remainder are lowered to use operations that
338  // produce two results, to match the available instructions. This exposes
339  // the two-result form to trivial CSE, which is able to combine x/y and x%y
340  // into a single instruction.
341  //
342  // Scalar integer multiply-high is also lowered to use two-result
343  // operations, to match the available instructions. However, plain multiply
344  // (low) operations are left as Legal, as there are single-result
345  // instructions for this in x86. Using the two-result multiply instructions
346  // when both high and low results are needed must be arranged by dagcombine.
347  for (unsigned i = 0, e = 4; i != e; ++i) {
348    MVT VT = IntVTs[i];
349    setOperationAction(ISD::MULHS, VT, Expand);
350    setOperationAction(ISD::MULHU, VT, Expand);
351    setOperationAction(ISD::SDIV, VT, Expand);
352    setOperationAction(ISD::UDIV, VT, Expand);
353    setOperationAction(ISD::SREM, VT, Expand);
354    setOperationAction(ISD::UREM, VT, Expand);
355
356    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
357    setOperationAction(ISD::ADDC, VT, Custom);
358    setOperationAction(ISD::ADDE, VT, Custom);
359    setOperationAction(ISD::SUBC, VT, Custom);
360    setOperationAction(ISD::SUBE, VT, Custom);
361  }
362
363  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
364  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
365  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
366  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
367  if (Subtarget->is64Bit())
368    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
369  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
370  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
371  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
372  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
373  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
374  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
375  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
376  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
377
378  setOperationAction(ISD::CTTZ             , MVT::i8   , Custom);
379  setOperationAction(ISD::CTLZ             , MVT::i8   , Custom);
380  setOperationAction(ISD::CTTZ             , MVT::i16  , Custom);
381  setOperationAction(ISD::CTLZ             , MVT::i16  , Custom);
382  setOperationAction(ISD::CTTZ             , MVT::i32  , Custom);
383  setOperationAction(ISD::CTLZ             , MVT::i32  , Custom);
384  if (Subtarget->is64Bit()) {
385    setOperationAction(ISD::CTTZ           , MVT::i64  , Custom);
386    setOperationAction(ISD::CTLZ           , MVT::i64  , Custom);
387  }
388
389  if (Subtarget->hasPOPCNT()) {
390    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
391  } else {
392    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
393    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
394    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
395    if (Subtarget->is64Bit())
396      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
397  }
398
399  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
400  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
401
402  // These should be promoted to a larger select which is supported.
403  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
404  // X86 wants to expand cmov itself.
405  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
406  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
407  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
408  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
409  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
410  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
411  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
412  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
413  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
414  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
415  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
416  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
417  if (Subtarget->is64Bit()) {
418    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
419    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
420  }
421  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
422
423  // Darwin ABI issue.
424  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
425  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
426  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
427  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
428  if (Subtarget->is64Bit())
429    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
430  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
431  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
432  if (Subtarget->is64Bit()) {
433    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
434    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
435    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
436    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
437    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
438  }
439  // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
440  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
441  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
442  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
443  if (Subtarget->is64Bit()) {
444    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
445    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
446    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
447  }
448
449  if (Subtarget->hasXMM())
450    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
451
452  setOperationAction(ISD::MEMBARRIER    , MVT::Other, Custom);
453  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
454
455  // On X86 and X86-64, atomic operations are lowered to locked instructions.
456  // Locked instructions, in turn, have implicit fence semantics (all memory
457  // operations are flushed before issuing the locked instruction, and they
458  // are not buffered), so we can fold away the common pattern of
459  // fence-atomic-fence.
460  setShouldFoldAtomicFences(true);
461
462  // Expand certain atomics
463  for (unsigned i = 0, e = 4; i != e; ++i) {
464    MVT VT = IntVTs[i];
465    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
466    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
467  }
468
469  if (!Subtarget->is64Bit()) {
470    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
471    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
472    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
473    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
474    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
475    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
476    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
477  }
478
479  // FIXME - use subtarget debug flags
480  if (!Subtarget->isTargetDarwin() &&
481      !Subtarget->isTargetELF() &&
482      !Subtarget->isTargetCygMing()) {
483    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
484  }
485
486  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
487  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
488  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
489  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
490  if (Subtarget->is64Bit()) {
491    setExceptionPointerRegister(X86::RAX);
492    setExceptionSelectorRegister(X86::RDX);
493  } else {
494    setExceptionPointerRegister(X86::EAX);
495    setExceptionSelectorRegister(X86::EDX);
496  }
497  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
498  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
499
500  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
501
502  setOperationAction(ISD::TRAP, MVT::Other, Legal);
503
504  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
505  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
506  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
507  if (Subtarget->is64Bit()) {
508    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
509    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
510  } else {
511    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
512    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
513  }
514
515  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
516  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
517  setOperationAction(ISD::DYNAMIC_STACKALLOC,
518                     (Subtarget->is64Bit() ? MVT::i64 : MVT::i32),
519                     (Subtarget->isTargetCOFF()
520                      && !Subtarget->isTargetEnvMacho()
521                      ? Custom : Expand));
522
523  if (!UseSoftFloat && X86ScalarSSEf64) {
524    // f32 and f64 use SSE.
525    // Set up the FP register classes.
526    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
527    addRegisterClass(MVT::f64, X86::FR64RegisterClass);
528
529    // Use ANDPD to simulate FABS.
530    setOperationAction(ISD::FABS , MVT::f64, Custom);
531    setOperationAction(ISD::FABS , MVT::f32, Custom);
532
533    // Use XORP to simulate FNEG.
534    setOperationAction(ISD::FNEG , MVT::f64, Custom);
535    setOperationAction(ISD::FNEG , MVT::f32, Custom);
536
537    // Use ANDPD and ORPD to simulate FCOPYSIGN.
538    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
539    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
540
541    // Lower this to FGETSIGNx86 plus an AND.
542    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
543    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
544
545    // We don't support sin/cos/fmod
546    setOperationAction(ISD::FSIN , MVT::f64, Expand);
547    setOperationAction(ISD::FCOS , MVT::f64, Expand);
548    setOperationAction(ISD::FSIN , MVT::f32, Expand);
549    setOperationAction(ISD::FCOS , MVT::f32, Expand);
550
551    // Expand FP immediates into loads from the stack, except for the special
552    // cases we handle.
553    addLegalFPImmediate(APFloat(+0.0)); // xorpd
554    addLegalFPImmediate(APFloat(+0.0f)); // xorps
555  } else if (!UseSoftFloat && X86ScalarSSEf32) {
556    // Use SSE for f32, x87 for f64.
557    // Set up the FP register classes.
558    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
559    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
560
561    // Use ANDPS to simulate FABS.
562    setOperationAction(ISD::FABS , MVT::f32, Custom);
563
564    // Use XORP to simulate FNEG.
565    setOperationAction(ISD::FNEG , MVT::f32, Custom);
566
567    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
568
569    // Use ANDPS and ORPS to simulate FCOPYSIGN.
570    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
571    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
572
573    // We don't support sin/cos/fmod
574    setOperationAction(ISD::FSIN , MVT::f32, Expand);
575    setOperationAction(ISD::FCOS , MVT::f32, Expand);
576
577    // Special cases we handle for FP constants.
578    addLegalFPImmediate(APFloat(+0.0f)); // xorps
579    addLegalFPImmediate(APFloat(+0.0)); // FLD0
580    addLegalFPImmediate(APFloat(+1.0)); // FLD1
581    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
582    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
583
584    if (!UnsafeFPMath) {
585      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
586      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
587    }
588  } else if (!UseSoftFloat) {
589    // f32 and f64 in x87.
590    // Set up the FP register classes.
591    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
592    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
593
594    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
595    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
596    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
597    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
598
599    if (!UnsafeFPMath) {
600      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
601      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
602    }
603    addLegalFPImmediate(APFloat(+0.0)); // FLD0
604    addLegalFPImmediate(APFloat(+1.0)); // FLD1
605    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
606    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
607    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
608    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
609    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
610    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
611  }
612
613  // We don't support FMA.
614  setOperationAction(ISD::FMA, MVT::f64, Expand);
615  setOperationAction(ISD::FMA, MVT::f32, Expand);
616
617  // Long double always uses X87.
618  if (!UseSoftFloat) {
619    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
620    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
621    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
622    {
623      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
624      addLegalFPImmediate(TmpFlt);  // FLD0
625      TmpFlt.changeSign();
626      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
627
628      bool ignored;
629      APFloat TmpFlt2(+1.0);
630      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
631                      &ignored);
632      addLegalFPImmediate(TmpFlt2);  // FLD1
633      TmpFlt2.changeSign();
634      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
635    }
636
637    if (!UnsafeFPMath) {
638      setOperationAction(ISD::FSIN           , MVT::f80  , Expand);
639      setOperationAction(ISD::FCOS           , MVT::f80  , Expand);
640    }
641
642    setOperationAction(ISD::FMA, MVT::f80, Expand);
643  }
644
645  // Always use a library call for pow.
646  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
647  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
648  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
649
650  setOperationAction(ISD::FLOG, MVT::f80, Expand);
651  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
652  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
653  setOperationAction(ISD::FEXP, MVT::f80, Expand);
654  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
655
656  // First set operation action for all vector types to either promote
657  // (for widening) or expand (for scalarization). Then we will selectively
658  // turn on ones that can be effectively codegen'd.
659  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
660       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
661    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
662    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
663    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
664    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
665    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
666    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
667    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
668    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
669    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
670    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
671    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
672    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
673    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
674    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
675    setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
676    setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
677    setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
678    setOperationAction(ISD::INSERT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
679    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
680    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
681    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
682    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
683    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
684    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
685    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
686    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
687    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
688    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
689    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
690    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
691    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
692    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
693    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
694    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
695    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
696    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
697    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
698    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
699    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
700    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
701    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
702    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
703    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
704    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
705    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
706    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
707    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
708    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
709    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
710    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand);
711    setOperationAction(ISD::TRUNCATE,  (MVT::SimpleValueType)VT, Expand);
712    setOperationAction(ISD::SIGN_EXTEND,  (MVT::SimpleValueType)VT, Expand);
713    setOperationAction(ISD::ZERO_EXTEND,  (MVT::SimpleValueType)VT, Expand);
714    setOperationAction(ISD::ANY_EXTEND,  (MVT::SimpleValueType)VT, Expand);
715    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
716         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
717      setTruncStoreAction((MVT::SimpleValueType)VT,
718                          (MVT::SimpleValueType)InnerVT, Expand);
719    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
720    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
721    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
722  }
723
724  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
725  // with -msoft-float, disable use of MMX as well.
726  if (!UseSoftFloat && Subtarget->hasMMX()) {
727    addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass);
728    // No operations on x86mmx supported, everything uses intrinsics.
729  }
730
731  // MMX-sized vectors (other than x86mmx) are expected to be expanded
732  // into smaller operations.
733  setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
734  setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
735  setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
736  setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
737  setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
738  setOperationAction(ISD::AND,                MVT::v4i16, Expand);
739  setOperationAction(ISD::AND,                MVT::v2i32, Expand);
740  setOperationAction(ISD::AND,                MVT::v1i64, Expand);
741  setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
742  setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
743  setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
744  setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
745  setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
746  setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
747  setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
748  setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
749  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
750  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
751  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
752  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
753  setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
754  setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
755  setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
756  setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
757  setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
758  setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
759  setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
760  setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
761  setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
762
763  if (!UseSoftFloat && Subtarget->hasXMM()) {
764    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
765
766    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
767    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
768    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
769    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
770    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
771    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
772    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
773    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
774    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
775    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
776    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
777    setOperationAction(ISD::VSETCC,             MVT::v4f32, Custom);
778  }
779
780  if (!UseSoftFloat && Subtarget->hasXMMInt()) {
781    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
782
783    // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
784    // registers cannot be used even for integer operations.
785    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
786    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
787    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
788    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
789
790    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
791    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
792    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
793    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
794    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
795    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
796    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
797    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
798    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
799    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
800    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
801    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
802    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
803    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
804    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
805    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
806
807    setOperationAction(ISD::VSETCC,             MVT::v2f64, Custom);
808    setOperationAction(ISD::VSETCC,             MVT::v16i8, Custom);
809    setOperationAction(ISD::VSETCC,             MVT::v8i16, Custom);
810    setOperationAction(ISD::VSETCC,             MVT::v4i32, Custom);
811
812    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
813    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
814    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
815    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
816    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
817
818    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2f64, Custom);
819    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2i64, Custom);
820    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i8, Custom);
821    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i16, Custom);
822    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i32, Custom);
823
824    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
825    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
826      EVT VT = (MVT::SimpleValueType)i;
827      // Do not attempt to custom lower non-power-of-2 vectors
828      if (!isPowerOf2_32(VT.getVectorNumElements()))
829        continue;
830      // Do not attempt to custom lower non-128-bit vectors
831      if (!VT.is128BitVector())
832        continue;
833      setOperationAction(ISD::BUILD_VECTOR,
834                         VT.getSimpleVT().SimpleTy, Custom);
835      setOperationAction(ISD::VECTOR_SHUFFLE,
836                         VT.getSimpleVT().SimpleTy, Custom);
837      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
838                         VT.getSimpleVT().SimpleTy, Custom);
839    }
840
841    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
842    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
843    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
844    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
845    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
846    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
847
848    if (Subtarget->is64Bit()) {
849      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
850      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
851    }
852
853    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
854    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
855      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
856      EVT VT = SVT;
857
858      // Do not attempt to promote non-128-bit vectors
859      if (!VT.is128BitVector())
860        continue;
861
862      setOperationAction(ISD::AND,    SVT, Promote);
863      AddPromotedToType (ISD::AND,    SVT, MVT::v2i64);
864      setOperationAction(ISD::OR,     SVT, Promote);
865      AddPromotedToType (ISD::OR,     SVT, MVT::v2i64);
866      setOperationAction(ISD::XOR,    SVT, Promote);
867      AddPromotedToType (ISD::XOR,    SVT, MVT::v2i64);
868      setOperationAction(ISD::LOAD,   SVT, Promote);
869      AddPromotedToType (ISD::LOAD,   SVT, MVT::v2i64);
870      setOperationAction(ISD::SELECT, SVT, Promote);
871      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
872    }
873
874    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
875
876    // Custom lower v2i64 and v2f64 selects.
877    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
878    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
879    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
880    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
881
882    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
883    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
884  }
885
886  if (Subtarget->hasSSE41() || Subtarget->hasAVX()) {
887    setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
888    setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
889    setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
890    setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
891    setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
892    setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
893    setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
894    setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
895    setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
896    setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
897
898    // FIXME: Do we need to handle scalar-to-vector here?
899    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
900
901    // Can turn SHL into an integer multiply.
902    setOperationAction(ISD::SHL,                MVT::v4i32, Custom);
903    setOperationAction(ISD::SHL,                MVT::v16i8, Custom);
904
905    // i8 and i16 vectors are custom , because the source register and source
906    // source memory operand types are not the same width.  f32 vectors are
907    // custom since the immediate controlling the insert encodes additional
908    // information.
909    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
910    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
911    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
912    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
913
914    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
915    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
916    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
917    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
918
919    if (Subtarget->is64Bit()) {
920      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Legal);
921      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
922    }
923  }
924
925  if (Subtarget->hasSSE2() || Subtarget->hasAVX()) {
926    setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
927    setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
928    setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
929    setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
930
931    setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
932    setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
933    setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
934
935    setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
936    setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
937  }
938
939  if (Subtarget->hasSSE42() || Subtarget->hasAVX())
940    setOperationAction(ISD::VSETCC,             MVT::v2i64, Custom);
941
942  if (!UseSoftFloat && Subtarget->hasAVX()) {
943    addRegisterClass(MVT::v32i8,  X86::VR256RegisterClass);
944    addRegisterClass(MVT::v16i16, X86::VR256RegisterClass);
945    addRegisterClass(MVT::v8i32,  X86::VR256RegisterClass);
946    addRegisterClass(MVT::v8f32,  X86::VR256RegisterClass);
947    addRegisterClass(MVT::v4i64,  X86::VR256RegisterClass);
948    addRegisterClass(MVT::v4f64,  X86::VR256RegisterClass);
949
950    setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
951    setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
952    setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
953
954    setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
955    setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
956    setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
957    setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
958    setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
959    setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
960
961    setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
962    setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
963    setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
964    setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
965    setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
966    setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
967
968    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
969    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
970    setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
971
972    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4f64,  Custom);
973    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i64,  Custom);
974    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f32,  Custom);
975    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i32,  Custom);
976    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i8,  Custom);
977    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i16, Custom);
978
979    setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
980    setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
981    setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
982    setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
983
984    setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
985    setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
986    setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
987    setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
988
989    setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
990    setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
991
992    setOperationAction(ISD::VSETCC,            MVT::v8i32, Custom);
993    setOperationAction(ISD::VSETCC,            MVT::v4i64, Custom);
994
995    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
996    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
997    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
998
999    // Custom lower several nodes for 256-bit types.
1000    for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
1001                  i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
1002      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
1003      EVT VT = SVT;
1004
1005      // Extract subvector is special because the value type
1006      // (result) is 128-bit but the source is 256-bit wide.
1007      if (VT.is128BitVector())
1008        setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom);
1009
1010      // Do not attempt to custom lower other non-256-bit vectors
1011      if (!VT.is256BitVector())
1012        continue;
1013
1014      setOperationAction(ISD::BUILD_VECTOR,       SVT, Custom);
1015      setOperationAction(ISD::VECTOR_SHUFFLE,     SVT, Custom);
1016      setOperationAction(ISD::INSERT_VECTOR_ELT,  SVT, Custom);
1017      setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom);
1018      setOperationAction(ISD::SCALAR_TO_VECTOR,   SVT, Custom);
1019      setOperationAction(ISD::INSERT_SUBVECTOR,   SVT, Custom);
1020    }
1021
1022    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1023    for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) {
1024      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
1025      EVT VT = SVT;
1026
1027      // Do not attempt to promote non-256-bit vectors
1028      if (!VT.is256BitVector())
1029        continue;
1030
1031      setOperationAction(ISD::AND,    SVT, Promote);
1032      AddPromotedToType (ISD::AND,    SVT, MVT::v4i64);
1033      setOperationAction(ISD::OR,     SVT, Promote);
1034      AddPromotedToType (ISD::OR,     SVT, MVT::v4i64);
1035      setOperationAction(ISD::XOR,    SVT, Promote);
1036      AddPromotedToType (ISD::XOR,    SVT, MVT::v4i64);
1037      setOperationAction(ISD::LOAD,   SVT, Promote);
1038      AddPromotedToType (ISD::LOAD,   SVT, MVT::v4i64);
1039      setOperationAction(ISD::SELECT, SVT, Promote);
1040      AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64);
1041    }
1042  }
1043
1044  // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1045  // of this type with custom code.
1046  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
1047         VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) {
1048    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Custom);
1049  }
1050
1051  // We want to custom lower some of our intrinsics.
1052  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1053
1054
1055  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1056  // handle type legalization for these operations here.
1057  //
1058  // FIXME: We really should do custom legalization for addition and
1059  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1060  // than generic legalization for 64-bit multiplication-with-overflow, though.
1061  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1062    // Add/Sub/Mul with overflow operations are custom lowered.
1063    MVT VT = IntVTs[i];
1064    setOperationAction(ISD::SADDO, VT, Custom);
1065    setOperationAction(ISD::UADDO, VT, Custom);
1066    setOperationAction(ISD::SSUBO, VT, Custom);
1067    setOperationAction(ISD::USUBO, VT, Custom);
1068    setOperationAction(ISD::SMULO, VT, Custom);
1069    setOperationAction(ISD::UMULO, VT, Custom);
1070  }
1071
1072  // There are no 8-bit 3-address imul/mul instructions
1073  setOperationAction(ISD::SMULO, MVT::i8, Expand);
1074  setOperationAction(ISD::UMULO, MVT::i8, Expand);
1075
1076  if (!Subtarget->is64Bit()) {
1077    // These libcalls are not available in 32-bit.
1078    setLibcallName(RTLIB::SHL_I128, 0);
1079    setLibcallName(RTLIB::SRL_I128, 0);
1080    setLibcallName(RTLIB::SRA_I128, 0);
1081  }
1082
1083  // We have target-specific dag combine patterns for the following nodes:
1084  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1085  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1086  setTargetDAGCombine(ISD::BUILD_VECTOR);
1087  setTargetDAGCombine(ISD::SELECT);
1088  setTargetDAGCombine(ISD::SHL);
1089  setTargetDAGCombine(ISD::SRA);
1090  setTargetDAGCombine(ISD::SRL);
1091  setTargetDAGCombine(ISD::OR);
1092  setTargetDAGCombine(ISD::AND);
1093  setTargetDAGCombine(ISD::ADD);
1094  setTargetDAGCombine(ISD::SUB);
1095  setTargetDAGCombine(ISD::STORE);
1096  setTargetDAGCombine(ISD::ZERO_EXTEND);
1097  setTargetDAGCombine(ISD::SINT_TO_FP);
1098  if (Subtarget->is64Bit())
1099    setTargetDAGCombine(ISD::MUL);
1100
1101  computeRegisterProperties();
1102
1103  // On Darwin, -Os means optimize for size without hurting performance,
1104  // do not reduce the limit.
1105  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1106  maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1107  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1108  maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1109  maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1110  maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1111  setPrefLoopAlignment(16);
1112  benefitFromCodePlacementOpt = true;
1113
1114  setPrefFunctionAlignment(4);
1115}
1116
1117
1118MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
1119  return MVT::i8;
1120}
1121
1122
1123/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1124/// the desired ByVal argument alignment.
1125static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1126  if (MaxAlign == 16)
1127    return;
1128  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1129    if (VTy->getBitWidth() == 128)
1130      MaxAlign = 16;
1131  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1132    unsigned EltAlign = 0;
1133    getMaxByValAlign(ATy->getElementType(), EltAlign);
1134    if (EltAlign > MaxAlign)
1135      MaxAlign = EltAlign;
1136  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1137    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1138      unsigned EltAlign = 0;
1139      getMaxByValAlign(STy->getElementType(i), EltAlign);
1140      if (EltAlign > MaxAlign)
1141        MaxAlign = EltAlign;
1142      if (MaxAlign == 16)
1143        break;
1144    }
1145  }
1146  return;
1147}
1148
1149/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1150/// function arguments in the caller parameter area. For X86, aggregates
1151/// that contain SSE vectors are placed at 16-byte boundaries while the rest
1152/// are at 4-byte boundaries.
1153unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1154  if (Subtarget->is64Bit()) {
1155    // Max of 8 and alignment of type.
1156    unsigned TyAlign = TD->getABITypeAlignment(Ty);
1157    if (TyAlign > 8)
1158      return TyAlign;
1159    return 8;
1160  }
1161
1162  unsigned Align = 4;
1163  if (Subtarget->hasXMM())
1164    getMaxByValAlign(Ty, Align);
1165  return Align;
1166}
1167
1168/// getOptimalMemOpType - Returns the target specific optimal type for load
1169/// and store operations as a result of memset, memcpy, and memmove
1170/// lowering. If DstAlign is zero that means it's safe to destination
1171/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1172/// means there isn't a need to check it against alignment requirement,
1173/// probably because the source does not need to be loaded. If
1174/// 'NonScalarIntSafe' is true, that means it's safe to return a
1175/// non-scalar-integer type, e.g. empty string source, constant, or loaded
1176/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
1177/// constant so it does not need to be loaded.
1178/// It returns EVT::Other if the type should be determined using generic
1179/// target-independent logic.
1180EVT
1181X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1182                                       unsigned DstAlign, unsigned SrcAlign,
1183                                       bool NonScalarIntSafe,
1184                                       bool MemcpyStrSrc,
1185                                       MachineFunction &MF) const {
1186  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
1187  // linux.  This is because the stack realignment code can't handle certain
1188  // cases like PR2962.  This should be removed when PR2962 is fixed.
1189  const Function *F = MF.getFunction();
1190  if (NonScalarIntSafe &&
1191      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
1192    if (Size >= 16 &&
1193        (Subtarget->isUnalignedMemAccessFast() ||
1194         ((DstAlign == 0 || DstAlign >= 16) &&
1195          (SrcAlign == 0 || SrcAlign >= 16))) &&
1196        Subtarget->getStackAlignment() >= 16) {
1197      if (Subtarget->hasSSE2())
1198        return MVT::v4i32;
1199      if (Subtarget->hasSSE1())
1200        return MVT::v4f32;
1201    } else if (!MemcpyStrSrc && Size >= 8 &&
1202               !Subtarget->is64Bit() &&
1203               Subtarget->getStackAlignment() >= 8 &&
1204               Subtarget->hasXMMInt()) {
1205      // Do not use f64 to lower memcpy if source is string constant. It's
1206      // better to use i32 to avoid the loads.
1207      return MVT::f64;
1208    }
1209  }
1210  if (Subtarget->is64Bit() && Size >= 8)
1211    return MVT::i64;
1212  return MVT::i32;
1213}
1214
1215/// getJumpTableEncoding - Return the entry encoding for a jump table in the
1216/// current function.  The returned value is a member of the
1217/// MachineJumpTableInfo::JTEntryKind enum.
1218unsigned X86TargetLowering::getJumpTableEncoding() const {
1219  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1220  // symbol.
1221  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1222      Subtarget->isPICStyleGOT())
1223    return MachineJumpTableInfo::EK_Custom32;
1224
1225  // Otherwise, use the normal jump table encoding heuristics.
1226  return TargetLowering::getJumpTableEncoding();
1227}
1228
1229const MCExpr *
1230X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1231                                             const MachineBasicBlock *MBB,
1232                                             unsigned uid,MCContext &Ctx) const{
1233  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1234         Subtarget->isPICStyleGOT());
1235  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1236  // entries.
1237  return MCSymbolRefExpr::Create(MBB->getSymbol(),
1238                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1239}
1240
1241/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
1242/// jumptable.
1243SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1244                                                    SelectionDAG &DAG) const {
1245  if (!Subtarget->is64Bit())
1246    // This doesn't have DebugLoc associated with it, but is not really the
1247    // same as a Register.
1248    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
1249  return Table;
1250}
1251
1252/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
1253/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
1254/// MCExpr.
1255const MCExpr *X86TargetLowering::
1256getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1257                             MCContext &Ctx) const {
1258  // X86-64 uses RIP relative addressing based on the jump table label.
1259  if (Subtarget->isPICStyleRIPRel())
1260    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1261
1262  // Otherwise, the reference is relative to the PIC base.
1263  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1264}
1265
1266// FIXME: Why this routine is here? Move to RegInfo!
1267std::pair<const TargetRegisterClass*, uint8_t>
1268X86TargetLowering::findRepresentativeClass(EVT VT) const{
1269  const TargetRegisterClass *RRC = 0;
1270  uint8_t Cost = 1;
1271  switch (VT.getSimpleVT().SimpleTy) {
1272  default:
1273    return TargetLowering::findRepresentativeClass(VT);
1274  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1275    RRC = (Subtarget->is64Bit()
1276           ? X86::GR64RegisterClass : X86::GR32RegisterClass);
1277    break;
1278  case MVT::x86mmx:
1279    RRC = X86::VR64RegisterClass;
1280    break;
1281  case MVT::f32: case MVT::f64:
1282  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1283  case MVT::v4f32: case MVT::v2f64:
1284  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1285  case MVT::v4f64:
1286    RRC = X86::VR128RegisterClass;
1287    break;
1288  }
1289  return std::make_pair(RRC, Cost);
1290}
1291
1292bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1293                                               unsigned &Offset) const {
1294  if (!Subtarget->isTargetLinux())
1295    return false;
1296
1297  if (Subtarget->is64Bit()) {
1298    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1299    Offset = 0x28;
1300    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1301      AddressSpace = 256;
1302    else
1303      AddressSpace = 257;
1304  } else {
1305    // %gs:0x14 on i386
1306    Offset = 0x14;
1307    AddressSpace = 256;
1308  }
1309  return true;
1310}
1311
1312
1313//===----------------------------------------------------------------------===//
1314//               Return Value Calling Convention Implementation
1315//===----------------------------------------------------------------------===//
1316
1317#include "X86GenCallingConv.inc"
1318
1319bool
1320X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
1321				  MachineFunction &MF, bool isVarArg,
1322                        const SmallVectorImpl<ISD::OutputArg> &Outs,
1323                        LLVMContext &Context) const {
1324  SmallVector<CCValAssign, 16> RVLocs;
1325  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1326                 RVLocs, Context);
1327  return CCInfo.CheckReturn(Outs, RetCC_X86);
1328}
1329
1330SDValue
1331X86TargetLowering::LowerReturn(SDValue Chain,
1332                               CallingConv::ID CallConv, bool isVarArg,
1333                               const SmallVectorImpl<ISD::OutputArg> &Outs,
1334                               const SmallVectorImpl<SDValue> &OutVals,
1335                               DebugLoc dl, SelectionDAG &DAG) const {
1336  MachineFunction &MF = DAG.getMachineFunction();
1337  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1338
1339  SmallVector<CCValAssign, 16> RVLocs;
1340  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1341                 RVLocs, *DAG.getContext());
1342  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1343
1344  // Add the regs to the liveout set for the function.
1345  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1346  for (unsigned i = 0; i != RVLocs.size(); ++i)
1347    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
1348      MRI.addLiveOut(RVLocs[i].getLocReg());
1349
1350  SDValue Flag;
1351
1352  SmallVector<SDValue, 6> RetOps;
1353  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1354  // Operand #1 = Bytes To Pop
1355  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
1356                   MVT::i16));
1357
1358  // Copy the result values into the output registers.
1359  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1360    CCValAssign &VA = RVLocs[i];
1361    assert(VA.isRegLoc() && "Can only return in registers!");
1362    SDValue ValToCopy = OutVals[i];
1363    EVT ValVT = ValToCopy.getValueType();
1364
1365    // If this is x86-64, and we disabled SSE, we can't return FP values,
1366    // or SSE or MMX vectors.
1367    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
1368         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
1369          (Subtarget->is64Bit() && !Subtarget->hasXMM())) {
1370      report_fatal_error("SSE register return with SSE disabled");
1371    }
1372    // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
1373    // llvm-gcc has never done it right and no one has noticed, so this
1374    // should be OK for now.
1375    if (ValVT == MVT::f64 &&
1376        (Subtarget->is64Bit() && !Subtarget->hasXMMInt()))
1377      report_fatal_error("SSE2 register return with SSE2 disabled");
1378
1379    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
1380    // the RET instruction and handled by the FP Stackifier.
1381    if (VA.getLocReg() == X86::ST0 ||
1382        VA.getLocReg() == X86::ST1) {
1383      // If this is a copy from an xmm register to ST(0), use an FPExtend to
1384      // change the value to the FP stack register class.
1385      if (isScalarFPTypeInSSEReg(VA.getValVT()))
1386        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
1387      RetOps.push_back(ValToCopy);
1388      // Don't emit a copytoreg.
1389      continue;
1390    }
1391
1392    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
1393    // which is returned in RAX / RDX.
1394    if (Subtarget->is64Bit()) {
1395      if (ValVT == MVT::x86mmx) {
1396        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1397          ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
1398          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
1399                                  ValToCopy);
1400          // If we don't have SSE2 available, convert to v4f32 so the generated
1401          // register is legal.
1402          if (!Subtarget->hasSSE2())
1403            ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
1404        }
1405      }
1406    }
1407
1408    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
1409    Flag = Chain.getValue(1);
1410  }
1411
1412  // The x86-64 ABI for returning structs by value requires that we copy
1413  // the sret argument into %rax for the return. We saved the argument into
1414  // a virtual register in the entry block, so now we copy the value out
1415  // and into %rax.
1416  if (Subtarget->is64Bit() &&
1417      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
1418    MachineFunction &MF = DAG.getMachineFunction();
1419    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1420    unsigned Reg = FuncInfo->getSRetReturnReg();
1421    assert(Reg &&
1422           "SRetReturnReg should have been set in LowerFormalArguments().");
1423    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
1424
1425    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
1426    Flag = Chain.getValue(1);
1427
1428    // RAX now acts like a return value.
1429    MRI.addLiveOut(X86::RAX);
1430  }
1431
1432  RetOps[0] = Chain;  // Update chain.
1433
1434  // Add the flag if we have it.
1435  if (Flag.getNode())
1436    RetOps.push_back(Flag);
1437
1438  return DAG.getNode(X86ISD::RET_FLAG, dl,
1439                     MVT::Other, &RetOps[0], RetOps.size());
1440}
1441
1442bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const {
1443  if (N->getNumValues() != 1)
1444    return false;
1445  if (!N->hasNUsesOfValue(1, 0))
1446    return false;
1447
1448  SDNode *Copy = *N->use_begin();
1449  if (Copy->getOpcode() != ISD::CopyToReg &&
1450      Copy->getOpcode() != ISD::FP_EXTEND)
1451    return false;
1452
1453  bool HasRet = false;
1454  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
1455       UI != UE; ++UI) {
1456    if (UI->getOpcode() != X86ISD::RET_FLAG)
1457      return false;
1458    HasRet = true;
1459  }
1460
1461  return HasRet;
1462}
1463
1464EVT
1465X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
1466                                            ISD::NodeType ExtendKind) const {
1467  MVT ReturnMVT;
1468  // TODO: Is this also valid on 32-bit?
1469  if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
1470    ReturnMVT = MVT::i8;
1471  else
1472    ReturnMVT = MVT::i32;
1473
1474  EVT MinVT = getRegisterType(Context, ReturnMVT);
1475  return VT.bitsLT(MinVT) ? MinVT : VT;
1476}
1477
1478/// LowerCallResult - Lower the result values of a call into the
1479/// appropriate copies out of appropriate physical registers.
1480///
1481SDValue
1482X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1483                                   CallingConv::ID CallConv, bool isVarArg,
1484                                   const SmallVectorImpl<ISD::InputArg> &Ins,
1485                                   DebugLoc dl, SelectionDAG &DAG,
1486                                   SmallVectorImpl<SDValue> &InVals) const {
1487
1488  // Assign locations to each value returned by this call.
1489  SmallVector<CCValAssign, 16> RVLocs;
1490  bool Is64Bit = Subtarget->is64Bit();
1491  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1492		 getTargetMachine(), RVLocs, *DAG.getContext());
1493  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1494
1495  // Copy all of the result registers out of their specified physreg.
1496  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1497    CCValAssign &VA = RVLocs[i];
1498    EVT CopyVT = VA.getValVT();
1499
1500    // If this is x86-64, and we disabled SSE, we can't return FP values
1501    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1502        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) {
1503      report_fatal_error("SSE register return with SSE disabled");
1504    }
1505
1506    SDValue Val;
1507
1508    // If this is a call to a function that returns an fp value on the floating
1509    // point stack, we must guarantee the the value is popped from the stack, so
1510    // a CopyFromReg is not good enough - the copy instruction may be eliminated
1511    // if the return value is not used. We use the FpPOP_RETVAL instruction
1512    // instead.
1513    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
1514      // If we prefer to use the value in xmm registers, copy it out as f80 and
1515      // use a truncate to move it from fp stack reg to xmm reg.
1516      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
1517      SDValue Ops[] = { Chain, InFlag };
1518      Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
1519                                         MVT::Other, MVT::Glue, Ops, 2), 1);
1520      Val = Chain.getValue(0);
1521
1522      // Round the f80 to the right size, which also moves it to the appropriate
1523      // xmm register.
1524      if (CopyVT != VA.getValVT())
1525        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1526                          // This truncation won't change the value.
1527                          DAG.getIntPtrConstant(1));
1528    } else {
1529      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1530                                 CopyVT, InFlag).getValue(1);
1531      Val = Chain.getValue(0);
1532    }
1533    InFlag = Chain.getValue(2);
1534    InVals.push_back(Val);
1535  }
1536
1537  return Chain;
1538}
1539
1540
1541//===----------------------------------------------------------------------===//
1542//                C & StdCall & Fast Calling Convention implementation
1543//===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard for many Windows API
//  routines. It differs only slightly from the C calling convention: the
//  callee, not the caller, cleans up the stack, and symbols are decorated
//  in a convention-specific way. It doesn't support any vector arguments.
//  For info on the fast calling convention, see the Fast Calling Convention
//  (tail call) implementation, LowerX86_32FastCCCallTo.
1550
1551/// CallIsStructReturn - Determines whether a call uses struct return
1552/// semantics.
1553static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1554  if (Outs.empty())
1555    return false;
1556
1557  return Outs[0].Flags.isSRet();
1558}
1559
1560/// ArgsAreStructReturn - Determines whether a function uses struct
1561/// return semantics.
1562static bool
1563ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1564  if (Ins.empty())
1565    return false;
1566
1567  return Ins[0].Flags.isSRet();
1568}
1569
1570/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
1571/// by "Src" to address "Dst" with size and alignment information specified by
1572/// the specific parameter attribute. The copy will be passed as a byval
1573/// function parameter.
1574static SDValue
1575CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1576                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1577                          DebugLoc dl) {
1578  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1579
1580  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1581                       /*isVolatile*/false, /*AlwaysInline=*/true,
1582                       MachinePointerInfo(), MachinePointerInfo());
1583}
1584
1585/// IsTailCallConvention - Return true if the calling convention is one that
1586/// supports tail call optimization.
1587static bool IsTailCallConvention(CallingConv::ID CC) {
1588  return (CC == CallingConv::Fast || CC == CallingConv::GHC);
1589}
1590
1591bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
1592  if (!CI->isTailCall())
1593    return false;
1594
1595  CallSite CS(CI);
1596  CallingConv::ID CalleeCC = CS.getCallingConv();
1597  if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
1598    return false;
1599
1600  return true;
1601}
1602
1603/// FuncIsMadeTailCallSafe - Return true if the function is being made into
1604/// a tailcall target by changing its ABI.
1605static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) {
1606  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
1607}
1608
1609SDValue
1610X86TargetLowering::LowerMemArgument(SDValue Chain,
1611                                    CallingConv::ID CallConv,
1612                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1613                                    DebugLoc dl, SelectionDAG &DAG,
1614                                    const CCValAssign &VA,
1615                                    MachineFrameInfo *MFI,
1616                                    unsigned i) const {
1617  // Create the nodes corresponding to a load from this parameter slot.
1618  ISD::ArgFlagsTy Flags = Ins[i].Flags;
1619  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv);
1620  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1621  EVT ValVT;
1622
1623  // If value is passed by pointer we have address passed instead of the value
1624  // itself.
1625  if (VA.getLocInfo() == CCValAssign::Indirect)
1626    ValVT = VA.getLocVT();
1627  else
1628    ValVT = VA.getValVT();
1629
1630  // FIXME: For now, all byval parameter objects are marked mutable. This can be
1631  // changed with more analysis.
1632  // In case of tail call optimization mark all arguments mutable. Since they
1633  // could be overwritten by lowering of arguments in case of a tail call.
1634  if (Flags.isByVal()) {
1635    unsigned Bytes = Flags.getByValSize();
1636    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
1637    int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
1638    return DAG.getFrameIndex(FI, getPointerTy());
1639  } else {
1640    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
1641                                    VA.getLocMemOffset(), isImmutable);
1642    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1643    return DAG.getLoad(ValVT, dl, Chain, FIN,
1644                       MachinePointerInfo::getFixedStack(FI),
1645                       false, false, 0);
1646  }
1647}
1648
1649SDValue
1650X86TargetLowering::LowerFormalArguments(SDValue Chain,
1651                                        CallingConv::ID CallConv,
1652                                        bool isVarArg,
1653                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1654                                        DebugLoc dl,
1655                                        SelectionDAG &DAG,
1656                                        SmallVectorImpl<SDValue> &InVals)
1657                                          const {
1658  MachineFunction &MF = DAG.getMachineFunction();
1659  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1660
1661  const Function* Fn = MF.getFunction();
1662  if (Fn->hasExternalLinkage() &&
1663      Subtarget->isTargetCygMing() &&
1664      Fn->getName() == "main")
1665    FuncInfo->setForceFramePointer(true);
1666
1667  MachineFrameInfo *MFI = MF.getFrameInfo();
1668  bool Is64Bit = Subtarget->is64Bit();
1669  bool IsWin64 = Subtarget->isTargetWin64();
1670
1671  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1672         "Var args not supported with calling convention fastcc or ghc");
1673
1674  // Assign locations to all of the incoming arguments.
1675  SmallVector<CCValAssign, 16> ArgLocs;
1676  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1677                 ArgLocs, *DAG.getContext());
1678
1679  // Allocate shadow area for Win64
1680  if (IsWin64) {
1681    CCInfo.AllocateStack(32, 8);
1682  }
1683
1684  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
1685
1686  unsigned LastVal = ~0U;
1687  SDValue ArgValue;
1688  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1689    CCValAssign &VA = ArgLocs[i];
1690    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
1691    // places.
1692    assert(VA.getValNo() != LastVal &&
1693           "Don't support value assigned to multiple locs yet");
1694    LastVal = VA.getValNo();
1695
1696    if (VA.isRegLoc()) {
1697      EVT RegVT = VA.getLocVT();
1698      TargetRegisterClass *RC = NULL;
1699      if (RegVT == MVT::i32)
1700        RC = X86::GR32RegisterClass;
1701      else if (Is64Bit && RegVT == MVT::i64)
1702        RC = X86::GR64RegisterClass;
1703      else if (RegVT == MVT::f32)
1704        RC = X86::FR32RegisterClass;
1705      else if (RegVT == MVT::f64)
1706        RC = X86::FR64RegisterClass;
1707      else if (RegVT.isVector() && RegVT.getSizeInBits() == 256)
1708        RC = X86::VR256RegisterClass;
1709      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
1710        RC = X86::VR128RegisterClass;
1711      else if (RegVT == MVT::x86mmx)
1712        RC = X86::VR64RegisterClass;
1713      else
1714        llvm_unreachable("Unknown argument type!");
1715
1716      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1717      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1718
1719      // If this is an 8 or 16-bit value, it is really passed promoted to 32
1720      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
1721      // right size.
1722      if (VA.getLocInfo() == CCValAssign::SExt)
1723        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1724                               DAG.getValueType(VA.getValVT()));
1725      else if (VA.getLocInfo() == CCValAssign::ZExt)
1726        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1727                               DAG.getValueType(VA.getValVT()));
1728      else if (VA.getLocInfo() == CCValAssign::BCvt)
1729        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
1730
1731      if (VA.isExtInLoc()) {
1732        // Handle MMX values passed in XMM regs.
1733        if (RegVT.isVector()) {
1734          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(),
1735                                 ArgValue);
1736        } else
1737          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1738      }
1739    } else {
1740      assert(VA.isMemLoc());
1741      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
1742    }
1743
1744    // If value is passed via pointer - do a load.
1745    if (VA.getLocInfo() == CCValAssign::Indirect)
1746      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
1747                             MachinePointerInfo(), false, false, 0);
1748
1749    InVals.push_back(ArgValue);
1750  }
1751
1752  // The x86-64 ABI for returning structs by value requires that we copy
1753  // the sret argument into %rax for the return. Save the argument into
1754  // a virtual register so that we can access it from the return points.
1755  if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
1756    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1757    unsigned Reg = FuncInfo->getSRetReturnReg();
1758    if (!Reg) {
1759      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
1760      FuncInfo->setSRetReturnReg(Reg);
1761    }
1762    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
1763    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1764  }
1765
1766  unsigned StackSize = CCInfo.getNextStackOffset();
1767  // Align stack specially for tail calls.
1768  if (FuncIsMadeTailCallSafe(CallConv))
1769    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1770
1771  // If the function takes variable number of arguments, make a frame index for
1772  // the start of the first vararg value... for expansion of llvm.va_start.
1773  if (isVarArg) {
1774    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
1775                    CallConv != CallingConv::X86_ThisCall)) {
1776      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
1777    }
1778    if (Is64Bit) {
1779      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
1780
1781      // FIXME: We should really autogenerate these arrays
1782      static const unsigned GPR64ArgRegsWin64[] = {
1783        X86::RCX, X86::RDX, X86::R8,  X86::R9
1784      };
1785      static const unsigned GPR64ArgRegs64Bit[] = {
1786        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1787      };
1788      static const unsigned XMMArgRegs64Bit[] = {
1789        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1790        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1791      };
1792      const unsigned *GPR64ArgRegs;
1793      unsigned NumXMMRegs = 0;
1794
1795      if (IsWin64) {
1796        // The XMM registers which might contain var arg parameters are shadowed
1797        // in their paired GPR.  So we only need to save the GPR to their home
1798        // slots.
1799        TotalNumIntRegs = 4;
1800        GPR64ArgRegs = GPR64ArgRegsWin64;
1801      } else {
1802        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1803        GPR64ArgRegs = GPR64ArgRegs64Bit;
1804
1805        NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs);
1806      }
1807      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1808                                                       TotalNumIntRegs);
1809
1810      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
1811      assert(!(NumXMMRegs && !Subtarget->hasXMM()) &&
1812             "SSE register cannot be used when SSE is disabled!");
1813      assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
1814             "SSE register cannot be used when SSE is disabled!");
1815      if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM())
1816        // Kernel mode asks for SSE to be disabled, so don't push them
1817        // on the stack.
1818        TotalNumXMMRegs = 0;
1819
1820      if (IsWin64) {
1821        const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
1822        // Get to the caller-allocated home save location.  Add 8 to account
1823        // for the return address.
1824        int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
1825        FuncInfo->setRegSaveFrameIndex(
1826          MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
1827        // Fixup to set vararg frame on shadow area (4 x i64).
1828        if (NumIntRegs < 4)
1829          FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
1830      } else {
1831        // For X86-64, if there are vararg parameters that are passed via
1832        // registers, then we must store them to their spots on the stack so they
1833        // may be loaded by deferencing the result of va_next.
1834        FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
1835        FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
1836        FuncInfo->setRegSaveFrameIndex(
1837          MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
1838                               false));
1839      }
1840
1841      // Store the integer parameter registers.
1842      SmallVector<SDValue, 8> MemOps;
1843      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
1844                                        getPointerTy());
1845      unsigned Offset = FuncInfo->getVarArgsGPOffset();
1846      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
1847        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
1848                                  DAG.getIntPtrConstant(Offset));
1849        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
1850                                     X86::GR64RegisterClass);
1851        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
1852        SDValue Store =
1853          DAG.getStore(Val.getValue(1), dl, Val, FIN,
1854                       MachinePointerInfo::getFixedStack(
1855                         FuncInfo->getRegSaveFrameIndex(), Offset),
1856                       false, false, 0);
1857        MemOps.push_back(Store);
1858        Offset += 8;
1859      }
1860
1861      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
1862        // Now store the XMM (fp + vector) parameter registers.
1863        SmallVector<SDValue, 11> SaveXMMOps;
1864        SaveXMMOps.push_back(Chain);
1865
1866        unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
1867        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
1868        SaveXMMOps.push_back(ALVal);
1869
1870        SaveXMMOps.push_back(DAG.getIntPtrConstant(
1871                               FuncInfo->getRegSaveFrameIndex()));
1872        SaveXMMOps.push_back(DAG.getIntPtrConstant(
1873                               FuncInfo->getVarArgsFPOffset()));
1874
1875        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
1876          unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
1877                                       X86::VR128RegisterClass);
1878          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
1879          SaveXMMOps.push_back(Val);
1880        }
1881        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
1882                                     MVT::Other,
1883                                     &SaveXMMOps[0], SaveXMMOps.size()));
1884      }
1885
1886      if (!MemOps.empty())
1887        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1888                            &MemOps[0], MemOps.size());
1889    }
1890  }
1891
1892  // Some CCs need callee pop.
1893  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) {
1894    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
1895  } else {
1896    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
1897    // If this is an sret function, the return should pop the hidden pointer.
1898    if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins))
1899      FuncInfo->setBytesToPopOnReturn(4);
1900  }
1901
1902  if (!Is64Bit) {
1903    // RegSaveFrameIndex is X86-64 only.
1904    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1905    if (CallConv == CallingConv::X86_FastCall ||
1906        CallConv == CallingConv::X86_ThisCall)
1907      // fastcc functions can't have varargs.
1908      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
1909  }
1910
1911  return Chain;
1912}
1913
1914SDValue
1915X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
1916                                    SDValue StackPtr, SDValue Arg,
1917                                    DebugLoc dl, SelectionDAG &DAG,
1918                                    const CCValAssign &VA,
1919                                    ISD::ArgFlagsTy Flags) const {
1920  unsigned LocMemOffset = VA.getLocMemOffset();
1921  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
1922  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1923  if (Flags.isByVal())
1924    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
1925
1926  return DAG.getStore(Chain, dl, Arg, PtrOff,
1927                      MachinePointerInfo::getStack(LocMemOffset),
1928                      false, false, 0);
1929}
1930
1931/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
1932/// optimization is performed and it is required.
1933SDValue
1934X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
1935                                           SDValue &OutRetAddr, SDValue Chain,
1936                                           bool IsTailCall, bool Is64Bit,
1937                                           int FPDiff, DebugLoc dl) const {
1938  // Adjust the Return address stack slot.
1939  EVT VT = getPointerTy();
1940  OutRetAddr = getReturnAddressFrameIndex(DAG);
1941
1942  // Load the "old" Return address.
1943  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
1944                           false, false, 0);
1945  return SDValue(OutRetAddr.getNode(), 1);
1946}
1947
1948/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
1949/// optimization is performed and it is required (FPDiff!=0).
1950static SDValue
1951EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
1952                         SDValue Chain, SDValue RetAddrFrIdx,
1953                         bool Is64Bit, int FPDiff, DebugLoc dl) {
1954  // Store the return address to the appropriate stack slot.
1955  if (!FPDiff) return Chain;
1956  // Calculate the new stack slot for the return address.
1957  int SlotSize = Is64Bit ? 8 : 4;
1958  int NewReturnAddrFI =
1959    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
1960  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
1961  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
1962  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1963                       MachinePointerInfo::getFixedStack(NewReturnAddrFI),
1964                       false, false, 0);
1965  return Chain;
1966}
1967
1968SDValue
1969X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
1970                             CallingConv::ID CallConv, bool isVarArg,
1971                             bool &isTailCall,
1972                             const SmallVectorImpl<ISD::OutputArg> &Outs,
1973                             const SmallVectorImpl<SDValue> &OutVals,
1974                             const SmallVectorImpl<ISD::InputArg> &Ins,
1975                             DebugLoc dl, SelectionDAG &DAG,
1976                             SmallVectorImpl<SDValue> &InVals) const {
1977  MachineFunction &MF = DAG.getMachineFunction();
1978  bool Is64Bit        = Subtarget->is64Bit();
1979  bool IsWin64        = Subtarget->isTargetWin64();
1980  bool IsStructRet    = CallIsStructReturn(Outs);
1981  bool IsSibcall      = false;
1982
1983  if (isTailCall) {
1984    // Check if it's really possible to do a tail call.
1985    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1986                    isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
1987                                                   Outs, OutVals, Ins, DAG);
1988
1989    // Sibcalls are automatically detected tailcalls which do not require
1990    // ABI changes.
1991    if (!GuaranteedTailCallOpt && isTailCall)
1992      IsSibcall = true;
1993
1994    if (isTailCall)
1995      ++NumTailCalls;
1996  }
1997
1998  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1999         "Var args not supported with calling convention fastcc or ghc");
2000
2001  // Analyze operands of the call, assigning locations to each operand.
2002  SmallVector<CCValAssign, 16> ArgLocs;
2003  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
2004                 ArgLocs, *DAG.getContext());
2005
2006  // Allocate shadow area for Win64
2007  if (IsWin64) {
2008    CCInfo.AllocateStack(32, 8);
2009  }
2010
2011  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2012
2013  // Get a count of how many bytes are to be pushed on the stack.
2014  unsigned NumBytes = CCInfo.getNextStackOffset();
2015  if (IsSibcall)
2016    // This is a sibcall. The memory operands are available in caller's
2017    // own caller's stack.
2018    NumBytes = 0;
2019  else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv))
2020    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2021
2022  int FPDiff = 0;
2023  if (isTailCall && !IsSibcall) {
2024    // Lower arguments at fp - stackoffset + fpdiff.
2025    unsigned NumBytesCallerPushed =
2026      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
2027    FPDiff = NumBytesCallerPushed - NumBytes;
2028
2029    // Set the delta of movement of the returnaddr stackslot.
2030    // But only set if delta is greater than previous delta.
2031    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
2032      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
2033  }
2034
2035  if (!IsSibcall)
2036    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
2037
2038  SDValue RetAddrFrIdx;
2039  // Load return address for tail calls.
2040  if (isTailCall && FPDiff)
2041    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2042                                    Is64Bit, FPDiff, dl);
2043
2044  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2045  SmallVector<SDValue, 8> MemOpChains;
2046  SDValue StackPtr;
2047
2048  // Walk the register/memloc assignments, inserting copies/loads.  In the case
2049  // of tail call optimization arguments are handle later.
2050  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2051    CCValAssign &VA = ArgLocs[i];
2052    EVT RegVT = VA.getLocVT();
2053    SDValue Arg = OutVals[i];
2054    ISD::ArgFlagsTy Flags = Outs[i].Flags;
2055    bool isByVal = Flags.isByVal();
2056
2057    // Promote the value if needed.
2058    switch (VA.getLocInfo()) {
2059    default: llvm_unreachable("Unknown loc info!");
2060    case CCValAssign::Full: break;
2061    case CCValAssign::SExt:
2062      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2063      break;
2064    case CCValAssign::ZExt:
2065      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2066      break;
2067    case CCValAssign::AExt:
2068      if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
2069        // Special case: passing MMX values in XMM registers.
2070        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2071        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2072        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2073      } else
2074        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2075      break;
2076    case CCValAssign::BCvt:
2077      Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2078      break;
2079    case CCValAssign::Indirect: {
2080      // Store the argument.
2081      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2082      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2083      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2084                           MachinePointerInfo::getFixedStack(FI),
2085                           false, false, 0);
2086      Arg = SpillSlot;
2087      break;
2088    }
2089    }
2090
2091    if (VA.isRegLoc()) {
2092      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2093      if (isVarArg && IsWin64) {
2094        // Win64 ABI requires argument XMM reg to be copied to the corresponding
2095        // shadow reg if callee is a varargs function.
2096        unsigned ShadowReg = 0;
2097        switch (VA.getLocReg()) {
2098        case X86::XMM0: ShadowReg = X86::RCX; break;
2099        case X86::XMM1: ShadowReg = X86::RDX; break;
2100        case X86::XMM2: ShadowReg = X86::R8; break;
2101        case X86::XMM3: ShadowReg = X86::R9; break;
2102        }
2103        if (ShadowReg)
2104          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2105      }
2106    } else if (!IsSibcall && (!isTailCall || isByVal)) {
2107      assert(VA.isMemLoc());
2108      if (StackPtr.getNode() == 0)
2109        StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
2110      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2111                                             dl, DAG, VA, Flags));
2112    }
2113  }
2114
2115  if (!MemOpChains.empty())
2116    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2117                        &MemOpChains[0], MemOpChains.size());
2118
2119  // Build a sequence of copy-to-reg nodes chained together with token chain
2120  // and flag operands which copy the outgoing args into registers.
2121  SDValue InFlag;
2122  // Tail call byval lowering might overwrite argument registers so in case of
2123  // tail call optimization the copies to registers are lowered later.
2124  if (!isTailCall)
2125    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2126      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2127                               RegsToPass[i].second, InFlag);
2128      InFlag = Chain.getValue(1);
2129    }
2130
2131  if (Subtarget->isPICStyleGOT()) {
2132    // ELF / PIC requires GOT in the EBX register before function calls via PLT
2133    // GOT pointer.
2134    if (!isTailCall) {
2135      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
2136                               DAG.getNode(X86ISD::GlobalBaseReg,
2137                                           DebugLoc(), getPointerTy()),
2138                               InFlag);
2139      InFlag = Chain.getValue(1);
2140    } else {
2141      // If we are tail calling and generating PIC/GOT style code load the
2142      // address of the callee into ECX. The value in ecx is used as target of
2143      // the tail jump. This is done to circumvent the ebx/callee-saved problem
2144      // for tail calls on PIC/GOT architectures. Normally we would just put the
2145      // address of GOT into ebx and then call target@PLT. But for tail calls
2146      // ebx would be restored (since ebx is callee saved) before jumping to the
2147      // target@PLT.
2148
2149      // Note: The actual moving to ECX is done further down.
2150      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2151      if (G && !G->getGlobal()->hasHiddenVisibility() &&
2152          !G->getGlobal()->hasProtectedVisibility())
2153        Callee = LowerGlobalAddress(Callee, DAG);
2154      else if (isa<ExternalSymbolSDNode>(Callee))
2155        Callee = LowerExternalSymbol(Callee, DAG);
2156    }
2157  }
2158
2159  if (Is64Bit && isVarArg && !IsWin64) {
2160    // From AMD64 ABI document:
2161    // For calls that may call functions that use varargs or stdargs
2162    // (prototype-less calls or calls to functions containing ellipsis (...) in
2163    // the declaration) %al is used as hidden argument to specify the number
2164    // of SSE registers used. The contents of %al do not need to match exactly
2165    // the number of registers, but must be an ubound on the number of SSE
2166    // registers used and is in the range 0 - 8 inclusive.
2167
2168    // Count the number of XMM registers allocated.
2169    static const unsigned XMMArgRegs[] = {
2170      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2171      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2172    };
2173    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2174    assert((Subtarget->hasXMM() || !NumXMMRegs)
2175           && "SSE registers cannot be used when SSE is disabled");
2176
2177    Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
2178                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
2179    InFlag = Chain.getValue(1);
2180  }
2181
2182
2183  // For tail calls lower the arguments to the 'real' stack slot.
2184  if (isTailCall) {
2185    // Force all the incoming stack arguments to be loaded from the stack
2186    // before any new outgoing arguments are stored to the stack, because the
2187    // outgoing stack slots may alias the incoming argument stack slots, and
2188    // the alias isn't otherwise explicit. This is slightly more conservative
2189    // than necessary, because it means that each store effectively depends
2190    // on every argument instead of just those arguments it would clobber.
2191    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2192
2193    SmallVector<SDValue, 8> MemOpChains2;
2194    SDValue FIN;
2195    int FI = 0;
2196    // Do not flag preceding copytoreg stuff together with the following stuff.
2197    InFlag = SDValue();
2198    if (GuaranteedTailCallOpt) {
2199      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2200        CCValAssign &VA = ArgLocs[i];
2201        if (VA.isRegLoc())
2202          continue;
2203        assert(VA.isMemLoc());
2204        SDValue Arg = OutVals[i];
2205        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2206        // Create frame index.
2207        int32_t Offset = VA.getLocMemOffset()+FPDiff;
2208        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2209        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2210        FIN = DAG.getFrameIndex(FI, getPointerTy());
2211
2212        if (Flags.isByVal()) {
2213          // Copy relative to framepointer.
2214          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2215          if (StackPtr.getNode() == 0)
2216            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
2217                                          getPointerTy());
2218          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2219
2220          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2221                                                           ArgChain,
2222                                                           Flags, DAG, dl));
2223        } else {
2224          // Store relative to framepointer.
2225          MemOpChains2.push_back(
2226            DAG.getStore(ArgChain, dl, Arg, FIN,
2227                         MachinePointerInfo::getFixedStack(FI),
2228                         false, false, 0));
2229        }
2230      }
2231    }
2232
2233    if (!MemOpChains2.empty())
2234      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2235                          &MemOpChains2[0], MemOpChains2.size());
2236
2237    // Copy arguments to their registers.
2238    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2239      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2240                               RegsToPass[i].second, InFlag);
2241      InFlag = Chain.getValue(1);
2242    }
2243    InFlag =SDValue();
2244
2245    // Store the return address to the appropriate stack slot.
2246    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
2247                                     FPDiff, dl);
2248  }
2249
2250  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2251    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2252    // In the 64-bit large code model, we have to make all calls
2253    // through a register, since the call instruction's 32-bit
2254    // pc-relative offset may not be large enough to hold the whole
2255    // address.
2256  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2257    // If the callee is a GlobalAddress node (quite common, every direct call
2258    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
2259    // it.
2260
2261    // We should use extra load for direct calls to dllimported functions in
2262    // non-JIT mode.
2263    const GlobalValue *GV = G->getGlobal();
2264    if (!GV->hasDLLImportLinkage()) {
2265      unsigned char OpFlags = 0;
2266      bool ExtraLoad = false;
2267      unsigned WrapperKind = ISD::DELETED_NODE;
2268
2269      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2270      // external symbols most go through the PLT in PIC mode.  If the symbol
2271      // has hidden or protected visibility, or if it is static or local, then
2272      // we don't need to use the PLT - we can directly call it.
2273      if (Subtarget->isTargetELF() &&
2274          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2275          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2276        OpFlags = X86II::MO_PLT;
2277      } else if (Subtarget->isPICStyleStubAny() &&
2278                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
2279                 (!Subtarget->getTargetTriple().isMacOSX() ||
2280                  Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2281        // PC-relative references to external symbols should go through $stub,
2282        // unless we're building with the leopard linker or later, which
2283        // automatically synthesizes these stubs.
2284        OpFlags = X86II::MO_DARWIN_STUB;
2285      } else if (Subtarget->isPICStyleRIPRel() &&
2286                 isa<Function>(GV) &&
2287                 cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) {
2288        // If the function is marked as non-lazy, generate an indirect call
2289        // which loads from the GOT directly. This avoids runtime overhead
2290        // at the cost of eager binding (and one extra byte of encoding).
2291        OpFlags = X86II::MO_GOTPCREL;
2292        WrapperKind = X86ISD::WrapperRIP;
2293        ExtraLoad = true;
2294      }
2295
2296      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
2297                                          G->getOffset(), OpFlags);
2298
2299      // Add a wrapper if needed.
2300      if (WrapperKind != ISD::DELETED_NODE)
2301        Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
2302      // Add extra indirection if needed.
2303      if (ExtraLoad)
2304        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
2305                             MachinePointerInfo::getGOT(),
2306                             false, false, 0);
2307    }
2308  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2309    unsigned char OpFlags = 0;
2310
2311    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
2312    // external symbols should go through the PLT.
2313    if (Subtarget->isTargetELF() &&
2314        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2315      OpFlags = X86II::MO_PLT;
2316    } else if (Subtarget->isPICStyleStubAny() &&
2317               (!Subtarget->getTargetTriple().isMacOSX() ||
2318                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2319      // PC-relative references to external symbols should go through $stub,
2320      // unless we're building with the leopard linker or later, which
2321      // automatically synthesizes these stubs.
2322      OpFlags = X86II::MO_DARWIN_STUB;
2323    }
2324
2325    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2326                                         OpFlags);
2327  }
2328
2329  // Returns a chain & a flag for retval copy to use.
2330  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2331  SmallVector<SDValue, 8> Ops;
2332
2333  if (!IsSibcall && isTailCall) {
2334    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2335                           DAG.getIntPtrConstant(0, true), InFlag);
2336    InFlag = Chain.getValue(1);
2337  }
2338
2339  Ops.push_back(Chain);
2340  Ops.push_back(Callee);
2341
2342  if (isTailCall)
2343    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2344
2345  // Add argument registers to the end of the list so that they are known live
2346  // into the call.
2347  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2348    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2349                                  RegsToPass[i].second.getValueType()));
2350
2351  // Add an implicit use GOT pointer in EBX.
2352  if (!isTailCall && Subtarget->isPICStyleGOT())
2353    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
2354
2355  // Add an implicit use of AL for non-Windows x86 64-bit vararg functions.
2356  if (Is64Bit && isVarArg && !IsWin64)
2357    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
2358
2359  if (InFlag.getNode())
2360    Ops.push_back(InFlag);
2361
2362  if (isTailCall) {
2363    // We used to do:
2364    //// If this is the first return lowered for this function, add the regs
2365    //// to the liveout set for the function.
2366    // This isn't right, although it's probably harmless on x86; liveouts
2367    // should be computed from returns not tail calls.  Consider a void
2368    // function making a tail call to a function returning int.
2369    return DAG.getNode(X86ISD::TC_RETURN, dl,
2370                       NodeTys, &Ops[0], Ops.size());
2371  }
2372
2373  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2374  InFlag = Chain.getValue(1);
2375
2376  // Create the CALLSEQ_END node.
2377  unsigned NumBytesForCalleeToPush;
2378  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt))
2379    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
2380  else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
2381    // If this is a call to a struct-return function, the callee
2382    // pops the hidden struct pointer, so we have to push it back.
2383    // This is common for Darwin/X86, Linux & Mingw32 targets.
2384    NumBytesForCalleeToPush = 4;
2385  else
2386    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2387
2388  // Returns a flag for retval copy to use.
2389  if (!IsSibcall) {
2390    Chain = DAG.getCALLSEQ_END(Chain,
2391                               DAG.getIntPtrConstant(NumBytes, true),
2392                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2393                                                     true),
2394                               InFlag);
2395    InFlag = Chain.getValue(1);
2396  }
2397
2398  // Handle result values, copying them out of physregs into vregs that we
2399  // return.
2400  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2401                         Ins, dl, DAG, InVals);
2402}
2403
2404
2405//===----------------------------------------------------------------------===//
2406//                Fast Calling Convention (tail call) implementation
2407//===----------------------------------------------------------------------===//
2408
//  Like stdcall, the callee cleans up the arguments, except that ECX is
//  reserved for storing the tail called function address. Only 2 registers are
//  free for argument passing (inreg). Tail call optimization is performed
//  provided:
//                * tailcallopt is enabled
//                * caller/callee are fastcc
//  On X86_64 architecture with GOT-style position independent code only local
//  (within module) calls are supported at the moment.
//  To keep the stack aligned according to the platform ABI, the function
//  GetAlignedArgumentStackSize ensures that the argument delta is always a
//  multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
//  for example.)
//  If a tail called function (the callee) has more arguments than the caller,
//  the caller needs to make sure that there is room to move the RETADDR to.
//  This is achieved by reserving an area the size of the argument delta right
//  after the original RETADDR, but before the saved framepointer or the
//  spilled registers,
//  e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
2425//  stack layout:
2426//    arg1
2427//    arg2
2428//    RETADDR
2429//    [ new RETADDR
2430//      move area ]
2431//    (possible EBP)
2432//    ESI
2433//    EDI
2434//    local1 ..
2435
2436/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
2437/// for a 16 byte align requirement.
2438unsigned
2439X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2440                                               SelectionDAG& DAG) const {
2441  MachineFunction &MF = DAG.getMachineFunction();
2442  const TargetMachine &TM = MF.getTarget();
2443  const TargetFrameLowering &TFI = *TM.getFrameLowering();
2444  unsigned StackAlignment = TFI.getStackAlignment();
2445  uint64_t AlignMask = StackAlignment - 1;
2446  int64_t Offset = StackSize;
2447  uint64_t SlotSize = TD->getPointerSize();
2448  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2449    // Number smaller than 12 so just add the difference.
2450    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2451  } else {
2452    // Mask out lower bits, add stackalignment once plus the 12 bytes.
2453    Offset = ((~AlignMask) & Offset) + StackAlignment +
2454      (StackAlignment-SlotSize);
2455  }
2456  return Offset;
2457}
2458
2459/// MatchingStackOffset - Return true if the given stack call argument is
2460/// already available in the same position (relatively) of the caller's
2461/// incoming argument stack.
2462static
2463bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2464                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2465                         const X86InstrInfo *TII) {
2466  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
2467  int FI = INT_MAX;
2468  if (Arg.getOpcode() == ISD::CopyFromReg) {
2469    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2470    if (!TargetRegisterInfo::isVirtualRegister(VR))
2471      return false;
2472    MachineInstr *Def = MRI->getVRegDef(VR);
2473    if (!Def)
2474      return false;
2475    if (!Flags.isByVal()) {
2476      if (!TII->isLoadFromStackSlot(Def, FI))
2477        return false;
2478    } else {
2479      unsigned Opcode = Def->getOpcode();
2480      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
2481          Def->getOperand(1).isFI()) {
2482        FI = Def->getOperand(1).getIndex();
2483        Bytes = Flags.getByValSize();
2484      } else
2485        return false;
2486    }
2487  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2488    if (Flags.isByVal())
2489      // ByVal argument is passed in as a pointer but it's now being
2490      // dereferenced. e.g.
2491      // define @foo(%struct.X* %A) {
2492      //   tail call @bar(%struct.X* byval %A)
2493      // }
2494      return false;
2495    SDValue Ptr = Ld->getBasePtr();
2496    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2497    if (!FINode)
2498      return false;
2499    FI = FINode->getIndex();
2500  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
2501    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
2502    FI = FINode->getIndex();
2503    Bytes = Flags.getByValSize();
2504  } else
2505    return false;
2506
2507  assert(FI != INT_MAX);
2508  if (!MFI->isFixedObjectIndex(FI))
2509    return false;
2510  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
2511}
2512
2513/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2514/// for tail call optimization. Targets which want to do tail call
2515/// optimization should implement this function.
2516bool
2517X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2518                                                     CallingConv::ID CalleeCC,
2519                                                     bool isVarArg,
2520                                                     bool isCalleeStructRet,
2521                                                     bool isCallerStructRet,
2522                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
2523                                    const SmallVectorImpl<SDValue> &OutVals,
2524                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2525                                                     SelectionDAG& DAG) const {
2526  if (!IsTailCallConvention(CalleeCC) &&
2527      CalleeCC != CallingConv::C)
2528    return false;
2529
2530  // If -tailcallopt is specified, make fastcc functions tail-callable.
2531  const MachineFunction &MF = DAG.getMachineFunction();
2532  const Function *CallerF = DAG.getMachineFunction().getFunction();
2533  CallingConv::ID CallerCC = CallerF->getCallingConv();
2534  bool CCMatch = CallerCC == CalleeCC;
2535
2536  if (GuaranteedTailCallOpt) {
2537    if (IsTailCallConvention(CalleeCC) && CCMatch)
2538      return true;
2539    return false;
2540  }
2541
2542  // Look for obvious safe cases to perform tail call optimization that do not
2543  // require ABI changes. This is what gcc calls sibcall.
2544
2545  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
2546  // emit a special epilogue.
2547  if (RegInfo->needsStackRealignment(MF))
2548    return false;
2549
2550  // Also avoid sibcall optimization if either caller or callee uses struct
2551  // return semantics.
2552  if (isCalleeStructRet || isCallerStructRet)
2553    return false;
2554
2555  // An stdcall caller is expected to clean up its arguments; the callee
2556  // isn't going to do that.
2557  if (!CCMatch && CallerCC==CallingConv::X86_StdCall)
2558    return false;
2559
2560  // Do not sibcall optimize vararg calls unless all arguments are passed via
2561  // registers.
2562  if (isVarArg && !Outs.empty()) {
2563
2564    // Optimizing for varargs on Win64 is unlikely to be safe without
2565    // additional testing.
2566    if (Subtarget->isTargetWin64())
2567      return false;
2568
2569    SmallVector<CCValAssign, 16> ArgLocs;
2570    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
2571		   getTargetMachine(), ArgLocs, *DAG.getContext());
2572
2573    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2574    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
2575      if (!ArgLocs[i].isRegLoc())
2576        return false;
2577  }
2578
2579  // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack.
2580  // Therefore if it's not used by the call it is not safe to optimize this into
2581  // a sibcall.
2582  bool Unused = false;
2583  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
2584    if (!Ins[i].Used) {
2585      Unused = true;
2586      break;
2587    }
2588  }
2589  if (Unused) {
2590    SmallVector<CCValAssign, 16> RVLocs;
2591    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
2592		   getTargetMachine(), RVLocs, *DAG.getContext());
2593    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2594    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2595      CCValAssign &VA = RVLocs[i];
2596      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
2597        return false;
2598    }
2599  }
2600
2601  // If the calling conventions do not match, then we'd better make sure the
2602  // results are returned in the same way as what the caller expects.
2603  if (!CCMatch) {
2604    SmallVector<CCValAssign, 16> RVLocs1;
2605    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
2606		    getTargetMachine(), RVLocs1, *DAG.getContext());
2607    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
2608
2609    SmallVector<CCValAssign, 16> RVLocs2;
2610    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
2611		    getTargetMachine(), RVLocs2, *DAG.getContext());
2612    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
2613
2614    if (RVLocs1.size() != RVLocs2.size())
2615      return false;
2616    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
2617      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
2618        return false;
2619      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
2620        return false;
2621      if (RVLocs1[i].isRegLoc()) {
2622        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
2623          return false;
2624      } else {
2625        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
2626          return false;
2627      }
2628    }
2629  }
2630
2631  // If the callee takes no arguments then go on to check the results of the
2632  // call.
2633  if (!Outs.empty()) {
2634    // Check if stack adjustment is needed. For now, do not do this if any
2635    // argument is passed on the stack.
2636    SmallVector<CCValAssign, 16> ArgLocs;
2637    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
2638		   getTargetMachine(), ArgLocs, *DAG.getContext());
2639
2640    // Allocate shadow area for Win64
2641    if (Subtarget->isTargetWin64()) {
2642      CCInfo.AllocateStack(32, 8);
2643    }
2644
2645    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2646    if (CCInfo.getNextStackOffset()) {
2647      MachineFunction &MF = DAG.getMachineFunction();
2648      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
2649        return false;
2650
2651      // Check if the arguments are already laid out in the right way as
2652      // the caller's fixed stack objects.
2653      MachineFrameInfo *MFI = MF.getFrameInfo();
2654      const MachineRegisterInfo *MRI = &MF.getRegInfo();
2655      const X86InstrInfo *TII =
2656        ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
2657      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2658        CCValAssign &VA = ArgLocs[i];
2659        SDValue Arg = OutVals[i];
2660        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2661        if (VA.getLocInfo() == CCValAssign::Indirect)
2662          return false;
2663        if (!VA.isRegLoc()) {
2664          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2665                                   MFI, MRI, TII))
2666            return false;
2667        }
2668      }
2669    }
2670
2671    // If the tailcall address may be in a register, then make sure it's
2672    // possible to register allocate for it. In 32-bit, the call address can
2673    // only target EAX, EDX, or ECX since the tail call must be scheduled after
2674    // callee-saved registers are restored. These happen to be the same
2675    // registers used to pass 'inreg' arguments so watch out for those.
2676    if (!Subtarget->is64Bit() &&
2677        !isa<GlobalAddressSDNode>(Callee) &&
2678        !isa<ExternalSymbolSDNode>(Callee)) {
2679      unsigned NumInRegs = 0;
2680      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2681        CCValAssign &VA = ArgLocs[i];
2682        if (!VA.isRegLoc())
2683          continue;
2684        unsigned Reg = VA.getLocReg();
2685        switch (Reg) {
2686        default: break;
2687        case X86::EAX: case X86::EDX: case X86::ECX:
2688          if (++NumInRegs == 3)
2689            return false;
2690          break;
2691        }
2692      }
2693    }
2694  }
2695
2696  return true;
2697}
2698
2699FastISel *
2700X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const {
2701  return X86::createFastISel(funcInfo);
2702}
2703
2704
2705//===----------------------------------------------------------------------===//
2706//                           Other Lowering Hooks
2707//===----------------------------------------------------------------------===//
2708
2709static bool MayFoldLoad(SDValue Op) {
2710  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
2711}
2712
2713static bool MayFoldIntoStore(SDValue Op) {
2714  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
2715}
2716
2717static bool isTargetShuffle(unsigned Opcode) {
2718  switch(Opcode) {
2719  default: return false;
2720  case X86ISD::PSHUFD:
2721  case X86ISD::PSHUFHW:
2722  case X86ISD::PSHUFLW:
2723  case X86ISD::SHUFPD:
2724  case X86ISD::PALIGN:
2725  case X86ISD::SHUFPS:
2726  case X86ISD::MOVLHPS:
2727  case X86ISD::MOVLHPD:
2728  case X86ISD::MOVHLPS:
2729  case X86ISD::MOVLPS:
2730  case X86ISD::MOVLPD:
2731  case X86ISD::MOVSHDUP:
2732  case X86ISD::MOVSLDUP:
2733  case X86ISD::MOVDDUP:
2734  case X86ISD::MOVSS:
2735  case X86ISD::MOVSD:
2736  case X86ISD::UNPCKLPS:
2737  case X86ISD::UNPCKLPD:
2738  case X86ISD::VUNPCKLPSY:
2739  case X86ISD::VUNPCKLPDY:
2740  case X86ISD::PUNPCKLWD:
2741  case X86ISD::PUNPCKLBW:
2742  case X86ISD::PUNPCKLDQ:
2743  case X86ISD::PUNPCKLQDQ:
2744  case X86ISD::UNPCKHPS:
2745  case X86ISD::UNPCKHPD:
2746  case X86ISD::VUNPCKHPSY:
2747  case X86ISD::VUNPCKHPDY:
2748  case X86ISD::PUNPCKHWD:
2749  case X86ISD::PUNPCKHBW:
2750  case X86ISD::PUNPCKHDQ:
2751  case X86ISD::PUNPCKHQDQ:
2752  case X86ISD::VPERMILPS:
2753  case X86ISD::VPERMILPSY:
2754  case X86ISD::VPERMILPD:
2755  case X86ISD::VPERMILPDY:
2756  case X86ISD::VPERM2F128:
2757    return true;
2758  }
2759  return false;
2760}
2761
2762static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
2763                                               SDValue V1, SelectionDAG &DAG) {
2764  switch(Opc) {
2765  default: llvm_unreachable("Unknown x86 shuffle node");
2766  case X86ISD::MOVSHDUP:
2767  case X86ISD::MOVSLDUP:
2768  case X86ISD::MOVDDUP:
2769    return DAG.getNode(Opc, dl, VT, V1);
2770  }
2771
2772  return SDValue();
2773}
2774
2775static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
2776                          SDValue V1, unsigned TargetMask, SelectionDAG &DAG) {
2777  switch(Opc) {
2778  default: llvm_unreachable("Unknown x86 shuffle node");
2779  case X86ISD::PSHUFD:
2780  case X86ISD::PSHUFHW:
2781  case X86ISD::PSHUFLW:
2782  case X86ISD::VPERMILPS:
2783  case X86ISD::VPERMILPSY:
2784  case X86ISD::VPERMILPD:
2785  case X86ISD::VPERMILPDY:
2786    return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
2787  }
2788
2789  return SDValue();
2790}
2791
2792static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
2793               SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) {
2794  switch(Opc) {
2795  default: llvm_unreachable("Unknown x86 shuffle node");
2796  case X86ISD::PALIGN:
2797  case X86ISD::SHUFPD:
2798  case X86ISD::SHUFPS:
2799  case X86ISD::VPERM2F128:
2800    return DAG.getNode(Opc, dl, VT, V1, V2,
2801                       DAG.getConstant(TargetMask, MVT::i8));
2802  }
2803  return SDValue();
2804}
2805
2806static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
2807                                    SDValue V1, SDValue V2, SelectionDAG &DAG) {
2808  switch(Opc) {
2809  default: llvm_unreachable("Unknown x86 shuffle node");
2810  case X86ISD::MOVLHPS:
2811  case X86ISD::MOVLHPD:
2812  case X86ISD::MOVHLPS:
2813  case X86ISD::MOVLPS:
2814  case X86ISD::MOVLPD:
2815  case X86ISD::MOVSS:
2816  case X86ISD::MOVSD:
2817  case X86ISD::UNPCKLPS:
2818  case X86ISD::UNPCKLPD:
2819  case X86ISD::VUNPCKLPSY:
2820  case X86ISD::VUNPCKLPDY:
2821  case X86ISD::PUNPCKLWD:
2822  case X86ISD::PUNPCKLBW:
2823  case X86ISD::PUNPCKLDQ:
2824  case X86ISD::PUNPCKLQDQ:
2825  case X86ISD::UNPCKHPS:
2826  case X86ISD::UNPCKHPD:
2827  case X86ISD::VUNPCKHPSY:
2828  case X86ISD::VUNPCKHPDY:
2829  case X86ISD::PUNPCKHWD:
2830  case X86ISD::PUNPCKHBW:
2831  case X86ISD::PUNPCKHDQ:
2832  case X86ISD::PUNPCKHQDQ:
2833    return DAG.getNode(Opc, dl, VT, V1, V2);
2834  }
2835  return SDValue();
2836}
2837
2838SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
2839  MachineFunction &MF = DAG.getMachineFunction();
2840  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2841  int ReturnAddrIndex = FuncInfo->getRAIndex();
2842
2843  if (ReturnAddrIndex == 0) {
2844    // Set up a frame object for the return address.
2845    uint64_t SlotSize = TD->getPointerSize();
2846    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
2847                                                           false);
2848    FuncInfo->setRAIndex(ReturnAddrIndex);
2849  }
2850
2851  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2852}
2853
2854
2855bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
2856                                       bool hasSymbolicDisplacement) {
2857  // Offset should fit into 32 bit immediate field.
2858  if (!isInt<32>(Offset))
2859    return false;
2860
2861  // If we don't have a symbolic displacement - we don't have any extra
2862  // restrictions.
2863  if (!hasSymbolicDisplacement)
2864    return true;
2865
2866  // FIXME: Some tweaks might be needed for medium code model.
2867  if (M != CodeModel::Small && M != CodeModel::Kernel)
2868    return false;
2869
2870  // For small code model we assume that latest object is 16MB before end of 31
2871  // bits boundary. We may also accept pretty large negative constants knowing
2872  // that all objects are in the positive half of address space.
2873  if (M == CodeModel::Small && Offset < 16*1024*1024)
2874    return true;
2875
2876  // For kernel code model we know that all object resist in the negative half
2877  // of 32bits address space. We may not accept negative offsets, since they may
2878  // be just off and we may accept pretty large positive ones.
2879  if (M == CodeModel::Kernel && Offset > 0)
2880    return true;
2881
2882  return false;
2883}
2884
2885/// isCalleePop - Determines whether the callee is required to pop its
2886/// own arguments. Callee pop is necessary to support tail calls.
2887bool X86::isCalleePop(CallingConv::ID CallingConv,
2888                      bool is64Bit, bool IsVarArg, bool TailCallOpt) {
2889  if (IsVarArg)
2890    return false;
2891
2892  switch (CallingConv) {
2893  default:
2894    return false;
2895  case CallingConv::X86_StdCall:
2896    return !is64Bit;
2897  case CallingConv::X86_FastCall:
2898    return !is64Bit;
2899  case CallingConv::X86_ThisCall:
2900    return !is64Bit;
2901  case CallingConv::Fast:
2902    return TailCallOpt;
2903  case CallingConv::GHC:
2904    return TailCallOpt;
2905  }
2906}
2907
2908/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
2909/// specific condition code, returning the condition code and the LHS/RHS of the
2910/// comparison to make.
2911static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
2912                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
2913  if (!isFP) {
2914    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2915      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
2916        // X > -1   -> X == 0, jump !sign.
2917        RHS = DAG.getConstant(0, RHS.getValueType());
2918        return X86::COND_NS;
2919      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
2920        // X < 0   -> X == 0, jump on sign.
2921        return X86::COND_S;
2922      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
2923        // X < 1   -> X <= 0
2924        RHS = DAG.getConstant(0, RHS.getValueType());
2925        return X86::COND_LE;
2926      }
2927    }
2928
2929    switch (SetCCOpcode) {
2930    default: llvm_unreachable("Invalid integer condition!");
2931    case ISD::SETEQ:  return X86::COND_E;
2932    case ISD::SETGT:  return X86::COND_G;
2933    case ISD::SETGE:  return X86::COND_GE;
2934    case ISD::SETLT:  return X86::COND_L;
2935    case ISD::SETLE:  return X86::COND_LE;
2936    case ISD::SETNE:  return X86::COND_NE;
2937    case ISD::SETULT: return X86::COND_B;
2938    case ISD::SETUGT: return X86::COND_A;
2939    case ISD::SETULE: return X86::COND_BE;
2940    case ISD::SETUGE: return X86::COND_AE;
2941    }
2942  }
2943
2944  // First determine if it is required or is profitable to flip the operands.
2945
2946  // If LHS is a foldable load, but RHS is not, flip the condition.
2947  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
2948      !ISD::isNON_EXTLoad(RHS.getNode())) {
2949    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2950    std::swap(LHS, RHS);
2951  }
2952
2953  switch (SetCCOpcode) {
2954  default: break;
2955  case ISD::SETOLT:
2956  case ISD::SETOLE:
2957  case ISD::SETUGT:
2958  case ISD::SETUGE:
2959    std::swap(LHS, RHS);
2960    break;
2961  }
2962
2963  // On a floating point condition, the flags are set as follows:
2964  // ZF  PF  CF   op
2965  //  0 | 0 | 0 | X > Y
2966  //  0 | 0 | 1 | X < Y
2967  //  1 | 0 | 0 | X == Y
2968  //  1 | 1 | 1 | unordered
2969  switch (SetCCOpcode) {
2970  default: llvm_unreachable("Condcode should be pre-legalized away");
2971  case ISD::SETUEQ:
2972  case ISD::SETEQ:   return X86::COND_E;
2973  case ISD::SETOLT:              // flipped
2974  case ISD::SETOGT:
2975  case ISD::SETGT:   return X86::COND_A;
2976  case ISD::SETOLE:              // flipped
2977  case ISD::SETOGE:
2978  case ISD::SETGE:   return X86::COND_AE;
2979  case ISD::SETUGT:              // flipped
2980  case ISD::SETULT:
2981  case ISD::SETLT:   return X86::COND_B;
2982  case ISD::SETUGE:              // flipped
2983  case ISD::SETULE:
2984  case ISD::SETLE:   return X86::COND_BE;
2985  case ISD::SETONE:
2986  case ISD::SETNE:   return X86::COND_NE;
2987  case ISD::SETUO:   return X86::COND_P;
2988  case ISD::SETO:    return X86::COND_NP;
2989  case ISD::SETOEQ:
2990  case ISD::SETUNE:  return X86::COND_INVALID;
2991  }
2992}
2993
2994/// hasFPCMov - is there a floating point cmov for the specific X86 condition
2995/// code. Current x86 isa includes the following FP cmov instructions:
2996/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2997static bool hasFPCMov(unsigned X86CC) {
2998  switch (X86CC) {
2999  default:
3000    return false;
3001  case X86::COND_B:
3002  case X86::COND_BE:
3003  case X86::COND_E:
3004  case X86::COND_P:
3005  case X86::COND_A:
3006  case X86::COND_AE:
3007  case X86::COND_NE:
3008  case X86::COND_NP:
3009    return true;
3010  }
3011}
3012
3013/// isFPImmLegal - Returns true if the target can instruction select the
3014/// specified FP immediate natively. If false, the legalizer will
3015/// materialize the FP immediate as a load from a constant pool.
3016bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3017  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3018    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3019      return true;
3020  }
3021  return false;
3022}
3023
/// isUndefOrInRange - Return true if Val is undef (negative) or if its value
/// falls within the half-open range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  // Negative mask values denote undef and always match.
  if (Val < 0)
    return true;

  return Low <= Val && Val < Hi;
}
3029
3030/// isUndefOrInRange - Return true if every element in Mask, begining
3031/// from position Pos and ending in Pos+Size, falls within the specified
3032/// range (L, L+Pos]. or is undef.
3033static bool isUndefOrInRange(const SmallVectorImpl<int> &Mask,
3034                             int Pos, int Size, int Low, int Hi) {
3035  for (int i = Pos, e = Pos+Size; i != e; ++i)
3036    if (!isUndefOrInRange(Mask[i], Low, Hi))
3037      return false;
3038  return true;
3039}
3040
/// isUndefOrEqual - Return true if Val is either undef (less than zero) or
/// equal to CmpVal.
static bool isUndefOrEqual(int Val, int CmpVal) {
  return Val < 0 || Val == CmpVal;
}
3048
3049/// isSequentialOrUndefInRange - Return true if every element in Mask, begining
3050/// from position Pos and ending in Pos+Size, falls within the specified
3051/// sequential range (L, L+Pos]. or is undef.
3052static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask,
3053                                       int Pos, int Size, int Low) {
3054  for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3055    if (!isUndefOrEqual(Mask[i], Low))
3056      return false;
3057  return true;
3058}
3059
3060/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3061/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
3062/// the second operand.
3063static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
3064  if (VT == MVT::v4f32 || VT == MVT::v4i32 )
3065    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
3066  if (VT == MVT::v2f64 || VT == MVT::v2i64)
3067    return (Mask[0] < 2 && Mask[1] < 2);
3068  return false;
3069}
3070
3071bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
3072  SmallVector<int, 8> M;
3073  N->getMask(M);
3074  return ::isPSHUFDMask(M, N->getValueType(0));
3075}
3076
3077/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3078/// is suitable for input to PSHUFHW.
3079static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
3080  if (VT != MVT::v8i16)
3081    return false;
3082
3083  // Lower quadword copied in order or undef.
3084  for (int i = 0; i != 4; ++i)
3085    if (Mask[i] >= 0 && Mask[i] != i)
3086      return false;
3087
3088  // Upper quadword shuffled.
3089  for (int i = 4; i != 8; ++i)
3090    if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
3091      return false;
3092
3093  return true;
3094}
3095
3096bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
3097  SmallVector<int, 8> M;
3098  N->getMask(M);
3099  return ::isPSHUFHWMask(M, N->getValueType(0));
3100}
3101
3102/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3103/// is suitable for input to PSHUFLW.
3104static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
3105  if (VT != MVT::v8i16)
3106    return false;
3107
3108  // Upper quadword copied in order.
3109  for (int i = 4; i != 8; ++i)
3110    if (Mask[i] >= 0 && Mask[i] != i)
3111      return false;
3112
3113  // Lower quadword shuffled.
3114  for (int i = 0; i != 4; ++i)
3115    if (Mask[i] >= 4)
3116      return false;
3117
3118  return true;
3119}
3120
3121bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
3122  SmallVector<int, 8> M;
3123  N->getMask(M);
3124  return ::isPSHUFLWMask(M, N->getValueType(0));
3125}
3126
3127/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
3128/// is suitable for input to PALIGNR.
3129static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
3130                          bool hasSSSE3) {
3131  int i, e = VT.getVectorNumElements();
3132  if (VT.getSizeInBits() != 128 && VT.getSizeInBits() != 64)
3133    return false;
3134
3135  // Do not handle v2i64 / v2f64 shuffles with palignr.
3136  if (e < 4 || !hasSSSE3)
3137    return false;
3138
3139  for (i = 0; i != e; ++i)
3140    if (Mask[i] >= 0)
3141      break;
3142
3143  // All undef, not a palignr.
3144  if (i == e)
3145    return false;
3146
3147  // Make sure we're shifting in the right direction.
3148  if (Mask[i] <= i)
3149    return false;
3150
3151  int s = Mask[i] - i;
3152
3153  // Check the rest of the elements to see if they are consecutive.
3154  for (++i; i != e; ++i) {
3155    int m = Mask[i];
3156    if (m >= 0 && m != s+i)
3157      return false;
3158  }
3159  return true;
3160}
3161
3162/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
3163/// specifies a shuffle of elements that is suitable for input to SHUFP*.
3164static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
3165  int NumElems = VT.getVectorNumElements();
3166  if (NumElems != 2 && NumElems != 4)
3167    return false;
3168
3169  int Half = NumElems / 2;
3170  for (int i = 0; i < Half; ++i)
3171    if (!isUndefOrInRange(Mask[i], 0, NumElems))
3172      return false;
3173  for (int i = Half; i < NumElems; ++i)
3174    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
3175      return false;
3176
3177  return true;
3178}
3179
3180bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
3181  SmallVector<int, 8> M;
3182  N->getMask(M);
3183  return ::isSHUFPMask(M, N->getValueType(0));
3184}
3185
3186/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
3187/// the reverse of what x86 shuffles want. x86 shuffles requires the lower
3188/// half elements to come from vector 1 (which would equal the dest.) and
3189/// the upper half to come from vector 2.
3190static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
3191  int NumElems = VT.getVectorNumElements();
3192
3193  if (NumElems != 2 && NumElems != 4)
3194    return false;
3195
3196  int Half = NumElems / 2;
3197  for (int i = 0; i < Half; ++i)
3198    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
3199      return false;
3200  for (int i = Half; i < NumElems; ++i)
3201    if (!isUndefOrInRange(Mask[i], 0, NumElems))
3202      return false;
3203  return true;
3204}
3205
3206static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
3207  SmallVector<int, 8> M;
3208  N->getMask(M);
3209  return isCommutedSHUFPMask(M, N->getValueType(0));
3210}
3211
3212/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
3213/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
3214bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
3215  EVT VT = N->getValueType(0);
3216  unsigned NumElems = VT.getVectorNumElements();
3217
3218  if (VT.getSizeInBits() != 128)
3219    return false;
3220
3221  if (NumElems != 4)
3222    return false;
3223
3224  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
3225  return isUndefOrEqual(N->getMaskElt(0), 6) &&
3226         isUndefOrEqual(N->getMaskElt(1), 7) &&
3227         isUndefOrEqual(N->getMaskElt(2), 2) &&
3228         isUndefOrEqual(N->getMaskElt(3), 3);
3229}
3230
3231/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
3232/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
3233/// <2, 3, 2, 3>
3234bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
3235  EVT VT = N->getValueType(0);
3236  unsigned NumElems = VT.getVectorNumElements();
3237
3238  if (VT.getSizeInBits() != 128)
3239    return false;
3240
3241  if (NumElems != 4)
3242    return false;
3243
3244  return isUndefOrEqual(N->getMaskElt(0), 2) &&
3245         isUndefOrEqual(N->getMaskElt(1), 3) &&
3246         isUndefOrEqual(N->getMaskElt(2), 2) &&
3247         isUndefOrEqual(N->getMaskElt(3), 3);
3248}
3249
3250/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
3251/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
3252bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
3253  unsigned NumElems = N->getValueType(0).getVectorNumElements();
3254
3255  if (NumElems != 2 && NumElems != 4)
3256    return false;
3257
3258  for (unsigned i = 0; i < NumElems/2; ++i)
3259    if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
3260      return false;
3261
3262  for (unsigned i = NumElems/2; i < NumElems; ++i)
3263    if (!isUndefOrEqual(N->getMaskElt(i), i))
3264      return false;
3265
3266  return true;
3267}
3268
3269/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
3270/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
3271bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
3272  unsigned NumElems = N->getValueType(0).getVectorNumElements();
3273
3274  if ((NumElems != 2 && NumElems != 4)
3275      || N->getValueType(0).getSizeInBits() > 128)
3276    return false;
3277
3278  for (unsigned i = 0; i < NumElems/2; ++i)
3279    if (!isUndefOrEqual(N->getMaskElt(i), i))
3280      return false;
3281
3282  for (unsigned i = 0; i < NumElems/2; ++i)
3283    if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
3284      return false;
3285
3286  return true;
3287}
3288
3289/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
3290/// specifies a shuffle of elements that is suitable for input to UNPCKL.
3291static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
3292                         bool V2IsSplat = false) {
3293  int NumElts = VT.getVectorNumElements();
3294
3295  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3296         "Unsupported vector type for unpckh");
3297
3298  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8)
3299    return false;
3300
3301  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3302  // independently on 128-bit lanes.
3303  unsigned NumLanes = VT.getSizeInBits()/128;
3304  unsigned NumLaneElts = NumElts/NumLanes;
3305
3306  unsigned Start = 0;
3307  unsigned End = NumLaneElts;
3308  for (unsigned s = 0; s < NumLanes; ++s) {
3309    for (unsigned i = Start, j = s * NumLaneElts;
3310         i != End;
3311         i += 2, ++j) {
3312      int BitI  = Mask[i];
3313      int BitI1 = Mask[i+1];
3314      if (!isUndefOrEqual(BitI, j))
3315        return false;
3316      if (V2IsSplat) {
3317        if (!isUndefOrEqual(BitI1, NumElts))
3318          return false;
3319      } else {
3320        if (!isUndefOrEqual(BitI1, j + NumElts))
3321          return false;
3322      }
3323    }
3324    // Process the next 128 bits.
3325    Start += NumLaneElts;
3326    End += NumLaneElts;
3327  }
3328
3329  return true;
3330}
3331
3332bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
3333  SmallVector<int, 8> M;
3334  N->getMask(M);
3335  return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
3336}
3337
3338/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
3339/// specifies a shuffle of elements that is suitable for input to UNPCKH.
3340static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
3341                         bool V2IsSplat = false) {
3342  int NumElts = VT.getVectorNumElements();
3343
3344  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3345         "Unsupported vector type for unpckh");
3346
3347  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8)
3348    return false;
3349
3350  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3351  // independently on 128-bit lanes.
3352  unsigned NumLanes = VT.getSizeInBits()/128;
3353  unsigned NumLaneElts = NumElts/NumLanes;
3354
3355  unsigned Start = 0;
3356  unsigned End = NumLaneElts;
3357  for (unsigned l = 0; l != NumLanes; ++l) {
3358    for (unsigned i = Start, j = (l*NumLaneElts)+NumLaneElts/2;
3359                             i != End; i += 2, ++j) {
3360      int BitI  = Mask[i];
3361      int BitI1 = Mask[i+1];
3362      if (!isUndefOrEqual(BitI, j))
3363        return false;
3364      if (V2IsSplat) {
3365        if (isUndefOrEqual(BitI1, NumElts))
3366          return false;
3367      } else {
3368        if (!isUndefOrEqual(BitI1, j+NumElts))
3369          return false;
3370      }
3371    }
3372    // Process the next 128 bits.
3373    Start += NumLaneElts;
3374    End += NumLaneElts;
3375  }
3376  return true;
3377}
3378
3379bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
3380  SmallVector<int, 8> M;
3381  N->getMask(M);
3382  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
3383}
3384
3385/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
3386/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
3387/// <0, 0, 1, 1>
3388static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
3389  int NumElems = VT.getVectorNumElements();
3390  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
3391    return false;
3392
3393  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3394  // independently on 128-bit lanes.
3395  unsigned NumLanes = VT.getSizeInBits() / 128;
3396  unsigned NumLaneElts = NumElems / NumLanes;
3397
3398  for (unsigned s = 0; s < NumLanes; ++s) {
3399    for (unsigned i = s * NumLaneElts, j = s * NumLaneElts;
3400         i != NumLaneElts * (s + 1);
3401         i += 2, ++j) {
3402      int BitI  = Mask[i];
3403      int BitI1 = Mask[i+1];
3404
3405      if (!isUndefOrEqual(BitI, j))
3406        return false;
3407      if (!isUndefOrEqual(BitI1, j))
3408        return false;
3409    }
3410  }
3411
3412  return true;
3413}
3414
3415bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
3416  SmallVector<int, 8> M;
3417  N->getMask(M);
3418  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
3419}
3420
3421/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
3422/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
3423/// <2, 2, 3, 3>
3424static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
3425  int NumElems = VT.getVectorNumElements();
3426  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
3427    return false;
3428
3429  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
3430    int BitI  = Mask[i];
3431    int BitI1 = Mask[i+1];
3432    if (!isUndefOrEqual(BitI, j))
3433      return false;
3434    if (!isUndefOrEqual(BitI1, j))
3435      return false;
3436  }
3437  return true;
3438}
3439
3440bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
3441  SmallVector<int, 8> M;
3442  N->getMask(M);
3443  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
3444}
3445
3446/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
3447/// specifies a shuffle of elements that is suitable for input to MOVSS,
3448/// MOVSD, and MOVD, i.e. setting the lowest element.
3449static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
3450  if (VT.getVectorElementType().getSizeInBits() < 32)
3451    return false;
3452
3453  int NumElts = VT.getVectorNumElements();
3454
3455  if (!isUndefOrEqual(Mask[0], NumElts))
3456    return false;
3457
3458  for (int i = 1; i < NumElts; ++i)
3459    if (!isUndefOrEqual(Mask[i], i))
3460      return false;
3461
3462  return true;
3463}
3464
3465bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
3466  SmallVector<int, 8> M;
3467  N->getMask(M);
3468  return ::isMOVLMask(M, N->getValueType(0));
3469}
3470
3471/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered
3472/// as permutations between 128-bit chunks or halves. As an example: this
3473/// shuffle bellow:
3474///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
3475/// The first half comes from the second half of V1 and the second half from the
3476/// the second half of V2.
3477static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
3478                             const X86Subtarget *Subtarget) {
3479  if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
3480    return false;
3481
3482  // The shuffle result is divided into half A and half B. In total the two
3483  // sources have 4 halves, namely: C, D, E, F. The final values of A and
3484  // B must come from C, D, E or F.
3485  int HalfSize = VT.getVectorNumElements()/2;
3486  bool MatchA = false, MatchB = false;
3487
3488  // Check if A comes from one of C, D, E, F.
3489  for (int Half = 0; Half < 4; ++Half) {
3490    if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
3491      MatchA = true;
3492      break;
3493    }
3494  }
3495
3496  // Check if B comes from one of C, D, E, F.
3497  for (int Half = 0; Half < 4; ++Half) {
3498    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
3499      MatchB = true;
3500      break;
3501    }
3502  }
3503
3504  return MatchA && MatchB;
3505}
3506
3507/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle
3508/// the specified VECTOR_MASK mask with VPERM2F128 instructions.
3509static unsigned getShuffleVPERM2F128Immediate(SDNode *N) {
3510  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3511  EVT VT = SVOp->getValueType(0);
3512
3513  int HalfSize = VT.getVectorNumElements()/2;
3514
3515  int FstHalf = 0, SndHalf = 0;
3516  for (int i = 0; i < HalfSize; ++i) {
3517    if (SVOp->getMaskElt(i) > 0) {
3518      FstHalf = SVOp->getMaskElt(i)/HalfSize;
3519      break;
3520    }
3521  }
3522  for (int i = HalfSize; i < HalfSize*2; ++i) {
3523    if (SVOp->getMaskElt(i) > 0) {
3524      SndHalf = SVOp->getMaskElt(i)/HalfSize;
3525      break;
3526    }
3527  }
3528
3529  return (FstHalf | (SndHalf << 4));
3530}
3531
3532/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand
3533/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
3534/// Note that VPERMIL mask matching is different depending whether theunderlying
3535/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
3536/// to the same elements of the low, but to the higher half of the source.
3537/// In VPERMILPD the two lanes could be shuffled independently of each other
3538/// with the same restriction that lanes can't be crossed.
3539static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT,
3540                            const X86Subtarget *Subtarget) {
3541  int NumElts = VT.getVectorNumElements();
3542  int NumLanes = VT.getSizeInBits()/128;
3543
3544  if (!Subtarget->hasAVX())
3545    return false;
3546
3547  // Match any permutation of 128-bit vector with 64-bit types
3548  if (NumLanes == 1 && NumElts != 2)
3549    return false;
3550
3551  // Only match 256-bit with 32 types
3552  if (VT.getSizeInBits() == 256 && NumElts != 4)
3553    return false;
3554
3555  // The mask on the high lane is independent of the low. Both can match
3556  // any element in inside its own lane, but can't cross.
3557  int LaneSize = NumElts/NumLanes;
3558  for (int l = 0; l < NumLanes; ++l)
3559    for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
3560      int LaneStart = l*LaneSize;
3561      if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize))
3562        return false;
3563    }
3564
3565  return true;
3566}
3567
3568/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand
3569/// specifies a shuffle of elements that is suitable for input to VPERMILPS*.
3570/// Note that VPERMIL mask matching is different depending whether theunderlying
3571/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
3572/// to the same elements of the low, but to the higher half of the source.
3573/// In VPERMILPD the two lanes could be shuffled independently of each other
3574/// with the same restriction that lanes can't be crossed.
3575static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT,
3576                            const X86Subtarget *Subtarget) {
3577  unsigned NumElts = VT.getVectorNumElements();
3578  unsigned NumLanes = VT.getSizeInBits()/128;
3579
3580  if (!Subtarget->hasAVX())
3581    return false;
3582
3583  // Match any permutation of 128-bit vector with 32-bit types
3584  if (NumLanes == 1 && NumElts != 4)
3585    return false;
3586
3587  // Only match 256-bit with 32 types
3588  if (VT.getSizeInBits() == 256 && NumElts != 8)
3589    return false;
3590
3591  // The mask on the high lane should be the same as the low. Actually,
3592  // they can differ if any of the corresponding index in a lane is undef
3593  // and the other stays in range.
3594  int LaneSize = NumElts/NumLanes;
3595  for (int i = 0; i < LaneSize; ++i) {
3596    int HighElt = i+LaneSize;
3597    bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts);
3598    bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize);
3599
3600    if (!HighValid || !LowValid)
3601      return false;
3602    if (Mask[i] < 0 || Mask[HighElt] < 0)
3603      continue;
3604    if (Mask[HighElt]-Mask[i] != LaneSize)
3605      return false;
3606  }
3607
3608  return true;
3609}
3610
3611/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle
3612/// the specified VECTOR_MASK mask with VPERMILPS* instructions.
3613static unsigned getShuffleVPERMILPSImmediate(SDNode *N) {
3614  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3615  EVT VT = SVOp->getValueType(0);
3616
3617  int NumElts = VT.getVectorNumElements();
3618  int NumLanes = VT.getSizeInBits()/128;
3619  int LaneSize = NumElts/NumLanes;
3620
3621  // Although the mask is equal for both lanes do it twice to get the cases
3622  // where a mask will match because the same mask element is undef on the
3623  // first half but valid on the second. This would get pathological cases
3624  // such as: shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid.
3625  unsigned Mask = 0;
3626  for (int l = 0; l < NumLanes; ++l) {
3627    for (int i = 0; i < LaneSize; ++i) {
3628      int MaskElt = SVOp->getMaskElt(i+(l*LaneSize));
3629      if (MaskElt < 0)
3630        continue;
3631      if (MaskElt >= LaneSize)
3632        MaskElt -= LaneSize;
3633      Mask |= MaskElt << (i*2);
3634    }
3635  }
3636
3637  return Mask;
3638}
3639
3640/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle
3641/// the specified VECTOR_MASK mask with VPERMILPD* instructions.
3642static unsigned getShuffleVPERMILPDImmediate(SDNode *N) {
3643  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3644  EVT VT = SVOp->getValueType(0);
3645
3646  int NumElts = VT.getVectorNumElements();
3647  int NumLanes = VT.getSizeInBits()/128;
3648
3649  unsigned Mask = 0;
3650  int LaneSize = NumElts/NumLanes;
3651  for (int l = 0; l < NumLanes; ++l)
3652    for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
3653      int MaskElt = SVOp->getMaskElt(i);
3654      if (MaskElt < 0)
3655        continue;
3656      Mask |= (MaskElt-l*LaneSize) << i;
3657    }
3658
3659  return Mask;
3660}
3661
3662/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
3663/// of what x86 movss want. X86 movs requires the lowest  element to be lowest
3664/// element of vector 2 and the other elements to come from vector 1 in order.
3665static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
3666                               bool V2IsSplat = false, bool V2IsUndef = false) {
3667  int NumOps = VT.getVectorNumElements();
3668  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
3669    return false;
3670
3671  if (!isUndefOrEqual(Mask[0], 0))
3672    return false;
3673
3674  for (int i = 1; i < NumOps; ++i)
3675    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
3676          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
3677          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
3678      return false;
3679
3680  return true;
3681}
3682
3683static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
3684                           bool V2IsUndef = false) {
3685  SmallVector<int, 8> M;
3686  N->getMask(M);
3687  return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
3688}
3689
3690/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3691/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
3692/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
3693bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N,
3694                         const X86Subtarget *Subtarget) {
3695  if (!Subtarget->hasSSE3() && !Subtarget->hasAVX())
3696    return false;
3697
3698  // The second vector must be undef
3699  if (N->getOperand(1).getOpcode() != ISD::UNDEF)
3700    return false;
3701
3702  EVT VT = N->getValueType(0);
3703  unsigned NumElems = VT.getVectorNumElements();
3704
3705  if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
3706      (VT.getSizeInBits() == 256 && NumElems != 8))
3707    return false;
3708
3709  // "i+1" is the value the indexed mask element must have
3710  for (unsigned i = 0; i < NumElems; i += 2)
3711    if (!isUndefOrEqual(N->getMaskElt(i), i+1) ||
3712        !isUndefOrEqual(N->getMaskElt(i+1), i+1))
3713      return false;
3714
3715  return true;
3716}
3717
3718/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3719/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
3720/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
3721bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N,
3722                         const X86Subtarget *Subtarget) {
3723  if (!Subtarget->hasSSE3() && !Subtarget->hasAVX())
3724    return false;
3725
3726  // The second vector must be undef
3727  if (N->getOperand(1).getOpcode() != ISD::UNDEF)
3728    return false;
3729
3730  EVT VT = N->getValueType(0);
3731  unsigned NumElems = VT.getVectorNumElements();
3732
3733  if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
3734      (VT.getSizeInBits() == 256 && NumElems != 8))
3735    return false;
3736
3737  // "i" is the value the indexed mask element must have
3738  for (unsigned i = 0; i < NumElems; i += 2)
3739    if (!isUndefOrEqual(N->getMaskElt(i), i) ||
3740        !isUndefOrEqual(N->getMaskElt(i+1), i))
3741      return false;
3742
3743  return true;
3744}
3745
3746/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3747/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
3748bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
3749  int e = N->getValueType(0).getVectorNumElements() / 2;
3750
3751  for (int i = 0; i < e; ++i)
3752    if (!isUndefOrEqual(N->getMaskElt(i), i))
3753      return false;
3754  for (int i = 0; i < e; ++i)
3755    if (!isUndefOrEqual(N->getMaskElt(e+i), i))
3756      return false;
3757  return true;
3758}
3759
3760/// isVEXTRACTF128Index - Return true if the specified
3761/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
3762/// suitable for input to VEXTRACTF128.
3763bool X86::isVEXTRACTF128Index(SDNode *N) {
3764  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
3765    return false;
3766
3767  // The index should be aligned on a 128-bit boundary.
3768  uint64_t Index =
3769    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
3770
3771  unsigned VL = N->getValueType(0).getVectorNumElements();
3772  unsigned VBits = N->getValueType(0).getSizeInBits();
3773  unsigned ElSize = VBits / VL;
3774  bool Result = (Index * ElSize) % 128 == 0;
3775
3776  return Result;
3777}
3778
3779/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR
3780/// operand specifies a subvector insert that is suitable for input to
3781/// VINSERTF128.
3782bool X86::isVINSERTF128Index(SDNode *N) {
3783  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
3784    return false;
3785
3786  // The index should be aligned on a 128-bit boundary.
3787  uint64_t Index =
3788    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
3789
3790  unsigned VL = N->getValueType(0).getVectorNumElements();
3791  unsigned VBits = N->getValueType(0).getSizeInBits();
3792  unsigned ElSize = VBits / VL;
3793  bool Result = (Index * ElSize) % 128 == 0;
3794
3795  return Result;
3796}
3797
3798/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
3799/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
3800unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
3801  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3802  int NumOperands = SVOp->getValueType(0).getVectorNumElements();
3803
3804  unsigned Shift = (NumOperands == 4) ? 2 : 1;
3805  unsigned Mask = 0;
3806  for (int i = 0; i < NumOperands; ++i) {
3807    int Val = SVOp->getMaskElt(NumOperands-i-1);
3808    if (Val < 0) Val = 0;
3809    if (Val >= NumOperands) Val -= NumOperands;
3810    Mask |= Val;
3811    if (i != NumOperands - 1)
3812      Mask <<= Shift;
3813  }
3814  return Mask;
3815}
3816
3817/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
3818/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
3819unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
3820  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3821  unsigned Mask = 0;
3822  // 8 nodes, but we only care about the last 4.
3823  for (unsigned i = 7; i >= 4; --i) {
3824    int Val = SVOp->getMaskElt(i);
3825    if (Val >= 0)
3826      Mask |= (Val - 4);
3827    if (i != 4)
3828      Mask <<= 2;
3829  }
3830  return Mask;
3831}
3832
3833/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
3834/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
3835unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
3836  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3837  unsigned Mask = 0;
3838  // 8 nodes, but we only care about the first 4.
3839  for (int i = 3; i >= 0; --i) {
3840    int Val = SVOp->getMaskElt(i);
3841    if (Val >= 0)
3842      Mask |= Val;
3843    if (i != 0)
3844      Mask <<= 2;
3845  }
3846  return Mask;
3847}
3848
3849/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
3850/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
3851unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
3852  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3853  EVT VVT = N->getValueType(0);
3854  unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3;
3855  int Val = 0;
3856
3857  unsigned i, e;
3858  for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) {
3859    Val = SVOp->getMaskElt(i);
3860    if (Val >= 0)
3861      break;
3862  }
3863  assert(Val - i > 0 && "PALIGNR imm should be positive");
3864  return (Val - i) * EltSize;
3865}
3866
3867/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate
3868/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
3869/// instructions.
3870unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
3871  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
3872    llvm_unreachable("Illegal extract subvector for VEXTRACTF128");
3873
3874  uint64_t Index =
3875    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
3876
3877  EVT VecVT = N->getOperand(0).getValueType();
3878  EVT ElVT = VecVT.getVectorElementType();
3879
3880  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
3881  return Index / NumElemsPerChunk;
3882}
3883
3884/// getInsertVINSERTF128Immediate - Return the appropriate immediate
3885/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
3886/// instructions.
3887unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
3888  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
3889    llvm_unreachable("Illegal insert subvector for VINSERTF128");
3890
3891  uint64_t Index =
3892    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
3893
3894  EVT VecVT = N->getValueType(0);
3895  EVT ElVT = VecVT.getVectorElementType();
3896
3897  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
3898  return Index / NumElemsPerChunk;
3899}
3900
3901/// isZeroNode - Returns true if Elt is a constant zero or a floating point
3902/// constant +0.0.
3903bool X86::isZeroNode(SDValue Elt) {
3904  return ((isa<ConstantSDNode>(Elt) &&
3905           cast<ConstantSDNode>(Elt)->isNullValue()) ||
3906          (isa<ConstantFPSDNode>(Elt) &&
3907           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
3908}
3909
3910/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
3911/// their permute mask.
3912static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
3913                                    SelectionDAG &DAG) {
3914  EVT VT = SVOp->getValueType(0);
3915  unsigned NumElems = VT.getVectorNumElements();
3916  SmallVector<int, 8> MaskVec;
3917
3918  for (unsigned i = 0; i != NumElems; ++i) {
3919    int idx = SVOp->getMaskElt(i);
3920    if (idx < 0)
3921      MaskVec.push_back(idx);
3922    else if (idx < (int)NumElems)
3923      MaskVec.push_back(idx + NumElems);
3924    else
3925      MaskVec.push_back(idx - NumElems);
3926  }
3927  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
3928                              SVOp->getOperand(0), &MaskVec[0]);
3929}
3930
3931/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
3932/// the two vector operands have swapped position.
3933static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
3934  unsigned NumElems = VT.getVectorNumElements();
3935  for (unsigned i = 0; i != NumElems; ++i) {
3936    int idx = Mask[i];
3937    if (idx < 0)
3938      continue;
3939    else if (idx < (int)NumElems)
3940      Mask[i] = idx + NumElems;
3941    else
3942      Mask[i] = idx - NumElems;
3943  }
3944}
3945
3946/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
3947/// match movhlps. The lower half elements should come from upper half of
3948/// V1 (and in order), and the upper half elements should come from the upper
3949/// half of V2 (and in order).
3950static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
3951  EVT VT = Op->getValueType(0);
3952  if (VT.getSizeInBits() != 128)
3953    return false;
3954  if (VT.getVectorNumElements() != 4)
3955    return false;
3956  for (unsigned i = 0, e = 2; i != e; ++i)
3957    if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
3958      return false;
3959  for (unsigned i = 2; i != 4; ++i)
3960    if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
3961      return false;
3962  return true;
3963}
3964
3965/// isScalarLoadToVector - Returns true if the node is a scalar load that
3966/// is promoted to a vector. It also returns the LoadSDNode by reference if
3967/// required.
3968static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
3969  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
3970    return false;
3971  N = N->getOperand(0).getNode();
3972  if (!ISD::isNON_EXTLoad(N))
3973    return false;
3974  if (LD)
3975    *LD = cast<LoadSDNode>(N);
3976  return true;
3977}
3978
3979/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
3980/// match movlp{s|d}. The lower half elements should come from lower half of
3981/// V1 (and in order), and the upper half elements should come from the upper
3982/// half of V2 (and in order). And since V1 will become the source of the
3983/// MOVLP, it must be either a vector load or a scalar load to vector.
3984static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
3985                               ShuffleVectorSDNode *Op) {
3986  EVT VT = Op->getValueType(0);
3987  if (VT.getSizeInBits() != 128)
3988    return false;
3989
3990  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
3991    return false;
3992  // Is V2 is a vector load, don't do this transformation. We will try to use
3993  // load folding shufps op.
3994  if (ISD::isNON_EXTLoad(V2))
3995    return false;
3996
3997  unsigned NumElems = VT.getVectorNumElements();
3998
3999  if (NumElems != 2 && NumElems != 4)
4000    return false;
4001  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4002    if (!isUndefOrEqual(Op->getMaskElt(i), i))
4003      return false;
4004  for (unsigned i = NumElems/2; i != NumElems; ++i)
4005    if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
4006      return false;
4007  return true;
4008}
4009
4010/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
4011/// all the same.
4012static bool isSplatVector(SDNode *N) {
4013  if (N->getOpcode() != ISD::BUILD_VECTOR)
4014    return false;
4015
4016  SDValue SplatValue = N->getOperand(0);
4017  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
4018    if (N->getOperand(i) != SplatValue)
4019      return false;
4020  return true;
4021}
4022
4023/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
4024/// to an zero vector.
4025/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
4026static bool isZeroShuffle(ShuffleVectorSDNode *N) {
4027  SDValue V1 = N->getOperand(0);
4028  SDValue V2 = N->getOperand(1);
4029  unsigned NumElems = N->getValueType(0).getVectorNumElements();
4030  for (unsigned i = 0; i != NumElems; ++i) {
4031    int Idx = N->getMaskElt(i);
4032    if (Idx >= (int)NumElems) {
4033      unsigned Opc = V2.getOpcode();
4034      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
4035        continue;
4036      if (Opc != ISD::BUILD_VECTOR ||
4037          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
4038        return false;
4039    } else if (Idx >= 0) {
4040      unsigned Opc = V1.getOpcode();
4041      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
4042        continue;
4043      if (Opc != ISD::BUILD_VECTOR ||
4044          !X86::isZeroNode(V1.getOperand(Idx)))
4045        return false;
4046    }
4047  }
4048  return true;
4049}
4050
4051/// getZeroVector - Returns a vector of specified type with all zero elements.
4052///
4053static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
4054                             DebugLoc dl) {
4055  assert(VT.isVector() && "Expected a vector type");
4056
4057  // Always build SSE zero vectors as <4 x i32> bitcasted
4058  // to their dest type. This ensures they get CSE'd.
4059  SDValue Vec;
4060  if (VT.getSizeInBits() == 128) {  // SSE
4061    if (HasSSE2) {  // SSE2
4062      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4063      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4064    } else { // SSE1
4065      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4066      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
4067    }
4068  } else if (VT.getSizeInBits() == 256) { // AVX
4069    // 256-bit logic and arithmetic instructions in AVX are
4070    // all floating-point, no support for integer ops. Default
4071    // to emitting fp zeroed vectors then.
4072    SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4073    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4074    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
4075  }
4076  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4077}
4078
4079/// getOnesVector - Returns a vector of specified type with all bits set.
4080/// Always build ones vectors as <4 x i32>. For 256-bit types, use two
4081/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their
4082/// original type, ensuring they get CSE'd.
4083static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
4084  assert(VT.isVector() && "Expected a vector type");
4085  assert((VT.is128BitVector() || VT.is256BitVector())
4086         && "Expected a 128-bit or 256-bit vector type");
4087
4088  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
4089  SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
4090                            Cst, Cst, Cst, Cst);
4091
4092  if (VT.is256BitVector()) {
4093    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
4094                              Vec, DAG.getConstant(0, MVT::i32), DAG, dl);
4095    Vec = Insert128BitVector(InsV, Vec,
4096                  DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl);
4097  }
4098
4099  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4100}
4101
4102/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
4103/// that point to V2 points to its first element.
4104static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
4105  EVT VT = SVOp->getValueType(0);
4106  unsigned NumElems = VT.getVectorNumElements();
4107
4108  bool Changed = false;
4109  SmallVector<int, 8> MaskVec;
4110  SVOp->getMask(MaskVec);
4111
4112  for (unsigned i = 0; i != NumElems; ++i) {
4113    if (MaskVec[i] > (int)NumElems) {
4114      MaskVec[i] = NumElems;
4115      Changed = true;
4116    }
4117  }
4118  if (Changed)
4119    return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
4120                                SVOp->getOperand(1), &MaskVec[0]);
4121  return SDValue(SVOp, 0);
4122}
4123
4124/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
4125/// operation of specified width.
4126static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4127                       SDValue V2) {
4128  unsigned NumElems = VT.getVectorNumElements();
4129  SmallVector<int, 8> Mask;
4130  Mask.push_back(NumElems);
4131  for (unsigned i = 1; i != NumElems; ++i)
4132    Mask.push_back(i);
4133  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4134}
4135
4136/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
4137static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4138                          SDValue V2) {
4139  unsigned NumElems = VT.getVectorNumElements();
4140  SmallVector<int, 8> Mask;
4141  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
4142    Mask.push_back(i);
4143    Mask.push_back(i + NumElems);
4144  }
4145  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4146}
4147
4148/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
4149static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4150                          SDValue V2) {
4151  unsigned NumElems = VT.getVectorNumElements();
4152  unsigned Half = NumElems/2;
4153  SmallVector<int, 8> Mask;
4154  for (unsigned i = 0; i != Half; ++i) {
4155    Mask.push_back(i + Half);
4156    Mask.push_back(i + NumElems + Half);
4157  }
4158  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4159}
4160
4161// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
4162// a generic shuffle instruction because the target has no such instructions.
4163// Generate shuffles which repeat i16 and i8 several times until they can be
4164// represented by v4f32 and then be manipulated by target suported shuffles.
4165static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
4166  EVT VT = V.getValueType();
4167  int NumElems = VT.getVectorNumElements();
4168  DebugLoc dl = V.getDebugLoc();
4169
4170  while (NumElems > 4) {
4171    if (EltNo < NumElems/2) {
4172      V = getUnpackl(DAG, dl, VT, V, V);
4173    } else {
4174      V = getUnpackh(DAG, dl, VT, V, V);
4175      EltNo -= NumElems/2;
4176    }
4177    NumElems >>= 1;
4178  }
4179  return V;
4180}
4181
4182/// getLegalSplat - Generate a legal splat with supported x86 shuffles
4183static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
4184  EVT VT = V.getValueType();
4185  DebugLoc dl = V.getDebugLoc();
4186  assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
4187         && "Vector size not supported");
4188
4189  bool Is128 = VT.getSizeInBits() == 128;
4190  EVT NVT = Is128 ? MVT::v4f32 : MVT::v8f32;
4191  V = DAG.getNode(ISD::BITCAST, dl, NVT, V);
4192
4193  if (Is128) {
4194    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
4195    V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]);
4196  } else {
4197    // The second half of indicies refer to the higher part, which is a
4198    // duplication of the lower one. This makes this shuffle a perfect match
4199    // for the VPERM instruction.
4200    int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
4201                         EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
4202    V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]);
4203  }
4204
4205  return DAG.getNode(ISD::BITCAST, dl, VT, V);
4206}
4207
4208/// PromoteSplat - Splat is promoted to target supported vector shuffles.
4209static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
4210  EVT SrcVT = SV->getValueType(0);
4211  SDValue V1 = SV->getOperand(0);
4212  DebugLoc dl = SV->getDebugLoc();
4213
4214  int EltNo = SV->getSplatIndex();
4215  int NumElems = SrcVT.getVectorNumElements();
4216  unsigned Size = SrcVT.getSizeInBits();
4217
4218  // Extract the 128-bit part containing the splat element and update
4219  // the splat element index when it refers to the higher register.
4220  if (Size == 256) {
4221    unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0;
4222    V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl);
4223    if (Idx > 0)
4224      EltNo -= NumElems/2;
4225  }
4226
4227  // All i16 and i8 vector types can't be used directly by a generic shuffle
4228  // instruction because the target has no such instruction. Generate shuffles
4229  // which repeat i16 and i8 several times until they fit in i32, and then can
4230  // be manipulated by target suported shuffles. After the insertion of the
4231  // necessary shuffles, the result is bitcasted back to v4f32 or v8f32.
4232  EVT EltVT = SrcVT.getVectorElementType();
4233  if (NumElems > 4 && (EltVT == MVT::i8 || EltVT == MVT::i16))
4234    V1 = PromoteSplati8i16(V1, DAG, EltNo);
4235
4236  // Recreate the 256-bit vector and place the same 128-bit vector
4237  // into the low and high part. This is necessary because we want
4238  // to use VPERM to shuffle the v8f32 vector, and VPERM only shuffles
4239  // inside each separate v4f32 lane.
4240  if (Size == 256) {
4241    SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1,
4242                         DAG.getConstant(0, MVT::i32), DAG, dl);
4243    V1 = Insert128BitVector(InsV, V1,
4244               DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
4245  }
4246
4247  return getLegalSplat(DAG, V1, EltNo);
4248}
4249
4250/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
4251/// vector of zero or undef vector.  This produces a shuffle where the low
4252/// element of V2 is swizzled into the zero/undef vector, landing at element
4253/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
4254static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
4255                                             bool isZero, bool HasSSE2,
4256                                             SelectionDAG &DAG) {
4257  EVT VT = V2.getValueType();
4258  SDValue V1 = isZero
4259    ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
4260  unsigned NumElems = VT.getVectorNumElements();
4261  SmallVector<int, 16> MaskVec;
4262  for (unsigned i = 0; i != NumElems; ++i)
4263    // If this is the insertion idx, put the low elt of V2 here.
4264    MaskVec.push_back(i == Idx ? NumElems : i);
4265  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
4266}
4267
4268/// getShuffleScalarElt - Returns the scalar element that will make up the ith
4269/// element of the result of the vector shuffle.
4270static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
4271                                   unsigned Depth) {
4272  if (Depth == 6)
4273    return SDValue();  // Limit search depth.
4274
4275  SDValue V = SDValue(N, 0);
4276  EVT VT = V.getValueType();
4277  unsigned Opcode = V.getOpcode();
4278
4279  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
4280  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
4281    Index = SV->getMaskElt(Index);
4282
4283    if (Index < 0)
4284      return DAG.getUNDEF(VT.getVectorElementType());
4285
4286    int NumElems = VT.getVectorNumElements();
4287    SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1);
4288    return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1);
4289  }
4290
4291  // Recurse into target specific vector shuffles to find scalars.
4292  if (isTargetShuffle(Opcode)) {
4293    int NumElems = VT.getVectorNumElements();
4294    SmallVector<unsigned, 16> ShuffleMask;
4295    SDValue ImmN;
4296
4297    switch(Opcode) {
4298    case X86ISD::SHUFPS:
4299    case X86ISD::SHUFPD:
4300      ImmN = N->getOperand(N->getNumOperands()-1);
4301      DecodeSHUFPSMask(NumElems,
4302                       cast<ConstantSDNode>(ImmN)->getZExtValue(),
4303                       ShuffleMask);
4304      break;
4305    case X86ISD::PUNPCKHBW:
4306    case X86ISD::PUNPCKHWD:
4307    case X86ISD::PUNPCKHDQ:
4308    case X86ISD::PUNPCKHQDQ:
4309      DecodePUNPCKHMask(NumElems, ShuffleMask);
4310      break;
4311    case X86ISD::UNPCKHPS:
4312    case X86ISD::UNPCKHPD:
4313    case X86ISD::VUNPCKHPSY:
4314    case X86ISD::VUNPCKHPDY:
4315      DecodeUNPCKHPMask(NumElems, ShuffleMask);
4316      break;
4317    case X86ISD::PUNPCKLBW:
4318    case X86ISD::PUNPCKLWD:
4319    case X86ISD::PUNPCKLDQ:
4320    case X86ISD::PUNPCKLQDQ:
4321      DecodePUNPCKLMask(VT, ShuffleMask);
4322      break;
4323    case X86ISD::UNPCKLPS:
4324    case X86ISD::UNPCKLPD:
4325    case X86ISD::VUNPCKLPSY:
4326    case X86ISD::VUNPCKLPDY:
4327      DecodeUNPCKLPMask(VT, ShuffleMask);
4328      break;
4329    case X86ISD::MOVHLPS:
4330      DecodeMOVHLPSMask(NumElems, ShuffleMask);
4331      break;
4332    case X86ISD::MOVLHPS:
4333      DecodeMOVLHPSMask(NumElems, ShuffleMask);
4334      break;
4335    case X86ISD::PSHUFD:
4336      ImmN = N->getOperand(N->getNumOperands()-1);
4337      DecodePSHUFMask(NumElems,
4338                      cast<ConstantSDNode>(ImmN)->getZExtValue(),
4339                      ShuffleMask);
4340      break;
4341    case X86ISD::PSHUFHW:
4342      ImmN = N->getOperand(N->getNumOperands()-1);
4343      DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(),
4344                        ShuffleMask);
4345      break;
4346    case X86ISD::PSHUFLW:
4347      ImmN = N->getOperand(N->getNumOperands()-1);
4348      DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(),
4349                        ShuffleMask);
4350      break;
4351    case X86ISD::MOVSS:
4352    case X86ISD::MOVSD: {
4353      // The index 0 always comes from the first element of the second source,
4354      // this is why MOVSS and MOVSD are used in the first place. The other
4355      // elements come from the other positions of the first source vector.
4356      unsigned OpNum = (Index == 0) ? 1 : 0;
4357      return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
4358                                 Depth+1);
4359    }
4360    case X86ISD::VPERMILPS:
4361      ImmN = N->getOperand(N->getNumOperands()-1);
4362      DecodeVPERMILPSMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(),
4363                        ShuffleMask);
4364      break;
4365    case X86ISD::VPERMILPSY:
4366      ImmN = N->getOperand(N->getNumOperands()-1);
4367      DecodeVPERMILPSMask(8, cast<ConstantSDNode>(ImmN)->getZExtValue(),
4368                        ShuffleMask);
4369      break;
4370    case X86ISD::VPERMILPD:
4371      ImmN = N->getOperand(N->getNumOperands()-1);
4372      DecodeVPERMILPDMask(2, cast<ConstantSDNode>(ImmN)->getZExtValue(),
4373                        ShuffleMask);
4374      break;
4375    case X86ISD::VPERMILPDY:
4376      ImmN = N->getOperand(N->getNumOperands()-1);
4377      DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(),
4378                        ShuffleMask);
4379      break;
4380    case X86ISD::VPERM2F128:
4381      ImmN = N->getOperand(N->getNumOperands()-1);
4382      DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
4383                           ShuffleMask);
4384      break;
4385    default:
4386      assert("not implemented for target shuffle node");
4387      return SDValue();
4388    }
4389
4390    Index = ShuffleMask[Index];
4391    if (Index < 0)
4392      return DAG.getUNDEF(VT.getVectorElementType());
4393
4394    SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1);
4395    return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
4396                               Depth+1);
4397  }
4398
4399  // Actual nodes that may contain scalar elements
4400  if (Opcode == ISD::BITCAST) {
4401    V = V.getOperand(0);
4402    EVT SrcVT = V.getValueType();
4403    unsigned NumElems = VT.getVectorNumElements();
4404
4405    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
4406      return SDValue();
4407  }
4408
4409  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
4410    return (Index == 0) ? V.getOperand(0)
4411                          : DAG.getUNDEF(VT.getVectorElementType());
4412
4413  if (V.getOpcode() == ISD::BUILD_VECTOR)
4414    return V.getOperand(Index);
4415
4416  return SDValue();
4417}
4418
4419/// getNumOfConsecutiveZeros - Return the number of elements of a vector
4420/// shuffle operation which come from a consecutively from a zero. The
4421/// search can start in two different directions, from left or right.
4422static
4423unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
4424                                  bool ZerosFromLeft, SelectionDAG &DAG) {
4425  int i = 0;
4426
4427  while (i < NumElems) {
4428    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
4429    SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0);
4430    if (!(Elt.getNode() &&
4431         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
4432      break;
4433    ++i;
4434  }
4435
4436  return i;
4437}
4438
4439/// isShuffleMaskConsecutive - Check if the shuffle mask indicies from MaskI to
4440/// MaskE correspond consecutively to elements from one of the vector operands,
4441/// starting from its index OpIdx. Also tell OpNum which source vector operand.
4442static
4443bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE,
4444                              int OpIdx, int NumElems, unsigned &OpNum) {
4445  bool SeenV1 = false;
4446  bool SeenV2 = false;
4447
4448  for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) {
4449    int Idx = SVOp->getMaskElt(i);
4450    // Ignore undef indicies
4451    if (Idx < 0)
4452      continue;
4453
4454    if (Idx < NumElems)
4455      SeenV1 = true;
4456    else
4457      SeenV2 = true;
4458
4459    // Only accept consecutive elements from the same vector
4460    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
4461      return false;
4462  }
4463
4464  OpNum = SeenV1 ? 0 : 1;
4465  return true;
4466}
4467
4468/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
4469/// logical left shift of a vector.
4470static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4471                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4472  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
4473  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
4474              false /* check zeros from right */, DAG);
4475  unsigned OpSrc;
4476
4477  if (!NumZeros)
4478    return false;
4479
4480  // Considering the elements in the mask that are not consecutive zeros,
4481  // check if they consecutively come from only one of the source vectors.
4482  //
4483  //               V1 = {X, A, B, C}     0
4484  //                         \  \  \    /
4485  //   vector_shuffle V1, V2 <1, 2, 3, X>
4486  //
4487  if (!isShuffleMaskConsecutive(SVOp,
4488            0,                   // Mask Start Index
4489            NumElems-NumZeros-1, // Mask End Index
4490            NumZeros,            // Where to start looking in the src vector
4491            NumElems,            // Number of elements in vector
4492            OpSrc))              // Which source operand ?
4493    return false;
4494
4495  isLeft = false;
4496  ShAmt = NumZeros;
4497  ShVal = SVOp->getOperand(OpSrc);
4498  return true;
4499}
4500
4501/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
4502/// logical left shift of a vector.
4503static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4504                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4505  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
4506  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
4507              true /* check zeros from left */, DAG);
4508  unsigned OpSrc;
4509
4510  if (!NumZeros)
4511    return false;
4512
4513  // Considering the elements in the mask that are not consecutive zeros,
4514  // check if they consecutively come from only one of the source vectors.
4515  //
4516  //                           0    { A, B, X, X } = V2
4517  //                          / \    /  /
4518  //   vector_shuffle V1, V2 <X, X, 4, 5>
4519  //
4520  if (!isShuffleMaskConsecutive(SVOp,
4521            NumZeros,     // Mask Start Index
4522            NumElems-1,   // Mask End Index
4523            0,            // Where to start looking in the src vector
4524            NumElems,     // Number of elements in vector
4525            OpSrc))       // Which source operand ?
4526    return false;
4527
4528  isLeft = true;
4529  ShAmt = NumZeros;
4530  ShVal = SVOp->getOperand(OpSrc);
4531  return true;
4532}
4533
4534/// isVectorShift - Returns true if the shuffle can be implemented as a
4535/// logical left or right shift of a vector.
4536static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4537                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4538  if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
4539      isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
4540    return true;
4541
4542  return false;
4543}
4544
4545/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
4546///
4547static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
4548                                       unsigned NumNonZero, unsigned NumZero,
4549                                       SelectionDAG &DAG,
4550                                       const TargetLowering &TLI) {
4551  if (NumNonZero > 8)
4552    return SDValue();
4553
4554  DebugLoc dl = Op.getDebugLoc();
4555  SDValue V(0, 0);
4556  bool First = true;
4557  for (unsigned i = 0; i < 16; ++i) {
4558    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
4559    if (ThisIsNonZero && First) {
4560      if (NumZero)
4561        V = getZeroVector(MVT::v8i16, true, DAG, dl);
4562      else
4563        V = DAG.getUNDEF(MVT::v8i16);
4564      First = false;
4565    }
4566
4567    if ((i & 1) != 0) {
4568      SDValue ThisElt(0, 0), LastElt(0, 0);
4569      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
4570      if (LastIsNonZero) {
4571        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
4572                              MVT::i16, Op.getOperand(i-1));
4573      }
4574      if (ThisIsNonZero) {
4575        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
4576        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
4577                              ThisElt, DAG.getConstant(8, MVT::i8));
4578        if (LastIsNonZero)
4579          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
4580      } else
4581        ThisElt = LastElt;
4582
4583      if (ThisElt.getNode())
4584        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
4585                        DAG.getIntPtrConstant(i/2));
4586    }
4587  }
4588
4589  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
4590}
4591
4592/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
4593///
4594static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
4595                                     unsigned NumNonZero, unsigned NumZero,
4596                                     SelectionDAG &DAG,
4597                                     const TargetLowering &TLI) {
4598  if (NumNonZero > 4)
4599    return SDValue();
4600
4601  DebugLoc dl = Op.getDebugLoc();
4602  SDValue V(0, 0);
4603  bool First = true;
4604  for (unsigned i = 0; i < 8; ++i) {
4605    bool isNonZero = (NonZeros & (1 << i)) != 0;
4606    if (isNonZero) {
4607      if (First) {
4608        if (NumZero)
4609          V = getZeroVector(MVT::v8i16, true, DAG, dl);
4610        else
4611          V = DAG.getUNDEF(MVT::v8i16);
4612        First = false;
4613      }
4614      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
4615                      MVT::v8i16, V, Op.getOperand(i),
4616                      DAG.getIntPtrConstant(i));
4617    }
4618  }
4619
4620  return V;
4621}
4622
4623/// getVShift - Return a vector logical shift node.
4624///
4625static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
4626                         unsigned NumBits, SelectionDAG &DAG,
4627                         const TargetLowering &TLI, DebugLoc dl) {
4628  EVT ShVT = MVT::v2i64;
4629  unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
4630  SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
4631  return DAG.getNode(ISD::BITCAST, dl, VT,
4632                     DAG.getNode(Opc, dl, ShVT, SrcOp,
4633                             DAG.getConstant(NumBits,
4634                                  TLI.getShiftAmountTy(SrcOp.getValueType()))));
4635}
4636
4637SDValue
4638X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
4639                                          SelectionDAG &DAG) const {
4640
4641  // Check if the scalar load can be widened into a vector load. And if
4642  // the address is "base + cst" see if the cst can be "absorbed" into
4643  // the shuffle mask.
4644  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
4645    SDValue Ptr = LD->getBasePtr();
4646    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
4647      return SDValue();
4648    EVT PVT = LD->getValueType(0);
4649    if (PVT != MVT::i32 && PVT != MVT::f32)
4650      return SDValue();
4651
4652    int FI = -1;
4653    int64_t Offset = 0;
4654    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
4655      FI = FINode->getIndex();
4656      Offset = 0;
4657    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
4658               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
4659      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
4660      Offset = Ptr.getConstantOperandVal(1);
4661      Ptr = Ptr.getOperand(0);
4662    } else {
4663      return SDValue();
4664    }
4665
4666    // FIXME: 256-bit vector instructions don't require a strict alignment,
4667    // improve this code to support it better.
4668    unsigned RequiredAlign = VT.getSizeInBits()/8;
4669    SDValue Chain = LD->getChain();
4670    // Make sure the stack object alignment is at least 16 or 32.
4671    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
4672    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
4673      if (MFI->isFixedObjectIndex(FI)) {
4674        // Can't change the alignment. FIXME: It's possible to compute
4675        // the exact stack offset and reference FI + adjust offset instead.
4676        // If someone *really* cares about this. That's the way to implement it.
4677        return SDValue();
4678      } else {
4679        MFI->setObjectAlignment(FI, RequiredAlign);
4680      }
4681    }
4682
4683    // (Offset % 16 or 32) must be multiple of 4. Then address is then
4684    // Ptr + (Offset & ~15).
4685    if (Offset < 0)
4686      return SDValue();
4687    if ((Offset % RequiredAlign) & 3)
4688      return SDValue();
4689    int64_t StartOffset = Offset & ~(RequiredAlign-1);
4690    if (StartOffset)
4691      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
4692                        Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
4693
4694    int EltNo = (Offset - StartOffset) >> 2;
4695    int NumElems = VT.getVectorNumElements();
4696
4697    EVT CanonVT = VT.getSizeInBits() == 128 ? MVT::v4i32 : MVT::v8i32;
4698    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
4699    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
4700                             LD->getPointerInfo().getWithOffset(StartOffset),
4701                             false, false, 0);
4702
4703    // Canonicalize it to a v4i32 or v8i32 shuffle.
4704    SmallVector<int, 8> Mask;
4705    for (int i = 0; i < NumElems; ++i)
4706      Mask.push_back(EltNo);
4707
4708    V1 = DAG.getNode(ISD::BITCAST, dl, CanonVT, V1);
4709    return DAG.getNode(ISD::BITCAST, dl, NVT,
4710                       DAG.getVectorShuffle(CanonVT, dl, V1,
4711                                            DAG.getUNDEF(CanonVT),&Mask[0]));
4712  }
4713
4714  return SDValue();
4715}
4716
4717/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
4718/// vector of type 'VT', see if the elements can be replaced by a single large
4719/// load which has the same value as a build_vector whose operands are 'elts'.
4720///
4721/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
4722///
4723/// FIXME: we'd also like to handle the case where the last elements are zero
4724/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
4725/// There's even a handy isZeroNode for that purpose.
4726static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
4727                                        DebugLoc &DL, SelectionDAG &DAG) {
4728  EVT EltVT = VT.getVectorElementType();
4729  unsigned NumElems = Elts.size();
4730
4731  LoadSDNode *LDBase = NULL;
4732  unsigned LastLoadedElt = -1U;
4733
4734  // For each element in the initializer, see if we've found a load or an undef.
4735  // If we don't find an initial load element, or later load elements are
4736  // non-consecutive, bail out.
4737  for (unsigned i = 0; i < NumElems; ++i) {
4738    SDValue Elt = Elts[i];
4739
4740    if (!Elt.getNode() ||
4741        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
4742      return SDValue();
4743    if (!LDBase) {
4744      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
4745        return SDValue();
4746      LDBase = cast<LoadSDNode>(Elt.getNode());
4747      LastLoadedElt = i;
4748      continue;
4749    }
4750    if (Elt.getOpcode() == ISD::UNDEF)
4751      continue;
4752
4753    LoadSDNode *LD = cast<LoadSDNode>(Elt);
4754    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
4755      return SDValue();
4756    LastLoadedElt = i;
4757  }
4758
4759  // If we have found an entire vector of loads and undefs, then return a large
4760  // load of the entire vector width starting at the base pointer.  If we found
4761  // consecutive loads for the low half, generate a vzext_load node.
4762  if (LastLoadedElt == NumElems - 1) {
4763    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
4764      return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
4765                         LDBase->getPointerInfo(),
4766                         LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
4767    return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
4768                       LDBase->getPointerInfo(),
4769                       LDBase->isVolatile(), LDBase->isNonTemporal(),
4770                       LDBase->getAlignment());
4771  } else if (NumElems == 4 && LastLoadedElt == 1 &&
4772             DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
4773    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
4774    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
4775    SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys,
4776                                              Ops, 2, MVT::i32,
4777                                              LDBase->getMemOperand());
4778    return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
4779  }
4780  return SDValue();
4781}
4782
4783SDValue
4784X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
4785  DebugLoc dl = Op.getDebugLoc();
4786
4787  EVT VT = Op.getValueType();
4788  EVT ExtVT = VT.getVectorElementType();
4789  unsigned NumElems = Op.getNumOperands();
4790
4791  // Vectors containing all zeros can be matched by pxor and xorps later
4792  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
4793    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
4794    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
4795    if (Op.getValueType() == MVT::v4i32 ||
4796        Op.getValueType() == MVT::v8i32)
4797      return Op;
4798
4799    return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
4800  }
4801
4802  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
4803  // vectors or broken into v4i32 operations on 256-bit vectors.
4804  if (ISD::isBuildVectorAllOnes(Op.getNode())) {
4805    if (Op.getValueType() == MVT::v4i32)
4806      return Op;
4807
4808    return getOnesVector(Op.getValueType(), DAG, dl);
4809  }
4810
4811  unsigned EVTBits = ExtVT.getSizeInBits();
4812
4813  unsigned NumZero  = 0;
4814  unsigned NumNonZero = 0;
4815  unsigned NonZeros = 0;
4816  bool IsAllConstants = true;
4817  SmallSet<SDValue, 8> Values;
4818  for (unsigned i = 0; i < NumElems; ++i) {
4819    SDValue Elt = Op.getOperand(i);
4820    if (Elt.getOpcode() == ISD::UNDEF)
4821      continue;
4822    Values.insert(Elt);
4823    if (Elt.getOpcode() != ISD::Constant &&
4824        Elt.getOpcode() != ISD::ConstantFP)
4825      IsAllConstants = false;
4826    if (X86::isZeroNode(Elt))
4827      NumZero++;
4828    else {
4829      NonZeros |= (1 << i);
4830      NumNonZero++;
4831    }
4832  }
4833
4834  // All undef vector. Return an UNDEF.  All zero vectors were handled above.
4835  if (NumNonZero == 0)
4836    return DAG.getUNDEF(VT);
4837
4838  // Special case for single non-zero, non-undef, element.
4839  if (NumNonZero == 1) {
4840    unsigned Idx = CountTrailingZeros_32(NonZeros);
4841    SDValue Item = Op.getOperand(Idx);
4842
4843    // If this is an insertion of an i64 value on x86-32, and if the top bits of
4844    // the value are obviously zero, truncate the value to i32 and do the
4845    // insertion that way.  Only do this if the value is non-constant or if the
4846    // value is a constant being inserted into element 0.  It is cheaper to do
4847    // a constant pool load than it is to do a movd + shuffle.
4848    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
4849        (!IsAllConstants || Idx == 0)) {
4850      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
4851        // Handle SSE only.
4852        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
4853        EVT VecVT = MVT::v4i32;
4854        unsigned VecElts = 4;
4855
4856        // Truncate the value (which may itself be a constant) to i32, and
4857        // convert it to a vector with movd (S2V+shuffle to zero extend).
4858        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
4859        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
4860        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
4861                                           Subtarget->hasSSE2(), DAG);
4862
4863        // Now we have our 32-bit value zero extended in the low element of
4864        // a vector.  If Idx != 0, swizzle it into place.
4865        if (Idx != 0) {
4866          SmallVector<int, 4> Mask;
4867          Mask.push_back(Idx);
4868          for (unsigned i = 1; i != VecElts; ++i)
4869            Mask.push_back(i);
4870          Item = DAG.getVectorShuffle(VecVT, dl, Item,
4871                                      DAG.getUNDEF(Item.getValueType()),
4872                                      &Mask[0]);
4873        }
4874        return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item);
4875      }
4876    }
4877
4878    // If we have a constant or non-constant insertion into the low element of
4879    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
4880    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
4881    // depending on what the source datatype is.
4882    if (Idx == 0) {
4883      if (NumZero == 0) {
4884        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
4885      } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
4886          (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
4887        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
4888        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
4889        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
4890                                           DAG);
4891      } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
4892        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
4893        assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
4894        EVT MiddleVT = MVT::v4i32;
4895        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
4896        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
4897                                           Subtarget->hasSSE2(), DAG);
4898        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
4899      }
4900    }
4901
4902    // Is it a vector logical left shift?
4903    if (NumElems == 2 && Idx == 1 &&
4904        X86::isZeroNode(Op.getOperand(0)) &&
4905        !X86::isZeroNode(Op.getOperand(1))) {
4906      unsigned NumBits = VT.getSizeInBits();
4907      return getVShift(true, VT,
4908                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
4909                                   VT, Op.getOperand(1)),
4910                       NumBits/2, DAG, *this, dl);
4911    }
4912
4913    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
4914      return SDValue();
4915
4916    // Otherwise, if this is a vector with i32 or f32 elements, and the element
4917    // is a non-constant being inserted into an element other than the low one,
4918    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
4919    // movd/movss) to move this into the low element, then shuffle it into
4920    // place.
4921    if (EVTBits == 32) {
4922      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
4923
4924      // Turn it into a shuffle of zero and zero-extended scalar to vector.
4925      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
4926                                         Subtarget->hasSSE2(), DAG);
4927      SmallVector<int, 8> MaskVec;
4928      for (unsigned i = 0; i < NumElems; i++)
4929        MaskVec.push_back(i == Idx ? 0 : 1);
4930      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
4931    }
4932  }
4933
4934  // Splat is obviously ok. Let legalizer expand it to a shuffle.
4935  if (Values.size() == 1) {
4936    if (EVTBits == 32) {
4937      // Instead of a shuffle like this:
4938      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
4939      // Check if it's possible to issue this instead.
4940      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
4941      unsigned Idx = CountTrailingZeros_32(NonZeros);
4942      SDValue Item = Op.getOperand(Idx);
4943      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
4944        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
4945    }
4946    return SDValue();
4947  }
4948
4949  // A vector full of immediates; various special cases are already
4950  // handled, so this is best done with a single constant-pool load.
4951  if (IsAllConstants)
4952    return SDValue();
4953
4954  // For AVX-length vectors, build the individual 128-bit pieces and use
4955  // shuffles to put them in place.
4956  if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) {
4957    SmallVector<SDValue, 32> V;
4958    for (unsigned i = 0; i < NumElems; ++i)
4959      V.push_back(Op.getOperand(i));
4960
4961    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
4962
4963    // Build both the lower and upper subvector.
4964    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
4965    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
4966                                NumElems/2);
4967
4968    // Recreate the wider vector with the lower and upper part.
4969    SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower,
4970                                DAG.getConstant(0, MVT::i32), DAG, dl);
4971    return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32),
4972                              DAG, dl);
4973  }
4974
4975  // Let legalizer expand 2-wide build_vectors.
4976  if (EVTBits == 64) {
4977    if (NumNonZero == 1) {
4978      // One half is zero or undef.
4979      unsigned Idx = CountTrailingZeros_32(NonZeros);
4980      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
4981                                 Op.getOperand(Idx));
4982      return getShuffleVectorZeroOrUndef(V2, Idx, true,
4983                                         Subtarget->hasSSE2(), DAG);
4984    }
4985    return SDValue();
4986  }
4987
4988  // If element VT is < 32 bits, convert it to inserts into a zero vector.
4989  if (EVTBits == 8 && NumElems == 16) {
4990    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
4991                                        *this);
4992    if (V.getNode()) return V;
4993  }
4994
4995  if (EVTBits == 16 && NumElems == 8) {
4996    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
4997                                      *this);
4998    if (V.getNode()) return V;
4999  }
5000
5001  // If element VT is == 32 bits, turn it into a number of shuffles.
5002  SmallVector<SDValue, 8> V;
5003  V.resize(NumElems);
5004  if (NumElems == 4 && NumZero > 0) {
5005    for (unsigned i = 0; i < 4; ++i) {
5006      bool isZero = !(NonZeros & (1 << i));
5007      if (isZero)
5008        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
5009      else
5010        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
5011    }
5012
5013    for (unsigned i = 0; i < 2; ++i) {
5014      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
5015        default: break;
5016        case 0:
5017          V[i] = V[i*2];  // Must be a zero vector.
5018          break;
5019        case 1:
5020          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
5021          break;
5022        case 2:
5023          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
5024          break;
5025        case 3:
5026          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
5027          break;
5028      }
5029    }
5030
5031    SmallVector<int, 8> MaskVec;
5032    bool Reverse = (NonZeros & 0x3) == 2;
5033    for (unsigned i = 0; i < 2; ++i)
5034      MaskVec.push_back(Reverse ? 1-i : i);
5035    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
5036    for (unsigned i = 0; i < 2; ++i)
5037      MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
5038    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
5039  }
5040
5041  if (Values.size() > 1 && VT.getSizeInBits() == 128) {
5042    // Check for a build vector of consecutive loads.
5043    for (unsigned i = 0; i < NumElems; ++i)
5044      V[i] = Op.getOperand(i);
5045
5046    // Check for elements which are consecutive loads.
5047    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
5048    if (LD.getNode())
5049      return LD;
5050
5051    // For SSE 4.1, use insertps to put the high elements into the low element.
5052    if (getSubtarget()->hasSSE41()) {
5053      SDValue Result;
5054      if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
5055        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
5056      else
5057        Result = DAG.getUNDEF(VT);
5058
5059      for (unsigned i = 1; i < NumElems; ++i) {
5060        if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
5061        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
5062                             Op.getOperand(i), DAG.getIntPtrConstant(i));
5063      }
5064      return Result;
5065    }
5066
5067    // Otherwise, expand into a number of unpckl*, start by extending each of
5068    // our (non-undef) elements to the full vector width with the element in the
5069    // bottom slot of the vector (which generates no code for SSE).
5070    for (unsigned i = 0; i < NumElems; ++i) {
5071      if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
5072        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
5073      else
5074        V[i] = DAG.getUNDEF(VT);
5075    }
5076
5077    // Next, we iteratively mix elements, e.g. for v4f32:
5078    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
5079    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
5080    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
5081    unsigned EltStride = NumElems >> 1;
5082    while (EltStride != 0) {
5083      for (unsigned i = 0; i < EltStride; ++i) {
5084        // If V[i+EltStride] is undef and this is the first round of mixing,
5085        // then it is safe to just drop this shuffle: V[i] is already in the
5086        // right place, the one element (since it's the first round) being
5087        // inserted as undef can be dropped.  This isn't safe for successive
5088        // rounds because they will permute elements within both vectors.
5089        if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
5090            EltStride == NumElems/2)
5091          continue;
5092
5093        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
5094      }
5095      EltStride >>= 1;
5096    }
5097    return V[0];
5098  }
5099  return SDValue();
5100}
5101
5102// LowerMMXCONCAT_VECTORS - We support concatenate two MMX registers and place
5103// them in a MMX register.  This is better than doing a stack convert.
5104static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
5105  DebugLoc dl = Op.getDebugLoc();
5106  EVT ResVT = Op.getValueType();
5107
5108  assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
5109         ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
5110  int Mask[2];
5111  SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0));
5112  SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
5113  InVec = Op.getOperand(1);
5114  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
5115    unsigned NumElts = ResVT.getVectorNumElements();
5116    VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
5117    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
5118                       InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
5119  } else {
5120    InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec);
5121    SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
5122    Mask[0] = 0; Mask[1] = 2;
5123    VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
5124  }
5125  return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
5126}
5127
5128// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
5129// to create 256-bit vectors from two other 128-bit ones.
5130static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
5131  DebugLoc dl = Op.getDebugLoc();
5132  EVT ResVT = Op.getValueType();
5133
5134  assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide");
5135
5136  SDValue V1 = Op.getOperand(0);
5137  SDValue V2 = Op.getOperand(1);
5138  unsigned NumElems = ResVT.getVectorNumElements();
5139
5140  SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1,
5141                                 DAG.getConstant(0, MVT::i32), DAG, dl);
5142  return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
5143                            DAG, dl);
5144}
5145
5146SDValue
5147X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
5148  EVT ResVT = Op.getValueType();
5149
5150  assert(Op.getNumOperands() == 2);
5151  assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) &&
5152         "Unsupported CONCAT_VECTORS for value type");
5153
5154  // We support concatenate two MMX registers and place them in a MMX register.
5155  // This is better than doing a stack convert.
5156  if (ResVT.is128BitVector())
5157    return LowerMMXCONCAT_VECTORS(Op, DAG);
5158
5159  // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
5160  // from two other 128-bit ones.
5161  return LowerAVXCONCAT_VECTORS(Op, DAG);
5162}
5163
5164// v8i16 shuffles - Prefer shuffles in the following order:
5165// 1. [all]   pshuflw, pshufhw, optional move
5166// 2. [ssse3] 1 x pshufb
5167// 3. [ssse3] 2 x pshufb + 1 x por
5168// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
5169SDValue
5170X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
5171                                            SelectionDAG &DAG) const {
5172  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5173  SDValue V1 = SVOp->getOperand(0);
5174  SDValue V2 = SVOp->getOperand(1);
5175  DebugLoc dl = SVOp->getDebugLoc();
5176  SmallVector<int, 8> MaskVals;
5177
5178  // Determine if more than 1 of the words in each of the low and high quadwords
5179  // of the result come from the same quadword of one of the two inputs.  Undef
5180  // mask values count as coming from any quadword, for better codegen.
5181  SmallVector<unsigned, 4> LoQuad(4);
5182  SmallVector<unsigned, 4> HiQuad(4);
5183  BitVector InputQuads(4);
5184  for (unsigned i = 0; i < 8; ++i) {
5185    SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
5186    int EltIdx = SVOp->getMaskElt(i);
5187    MaskVals.push_back(EltIdx);
5188    if (EltIdx < 0) {
5189      ++Quad[0];
5190      ++Quad[1];
5191      ++Quad[2];
5192      ++Quad[3];
5193      continue;
5194    }
5195    ++Quad[EltIdx / 4];
5196    InputQuads.set(EltIdx / 4);
5197  }
5198
5199  int BestLoQuad = -1;
5200  unsigned MaxQuad = 1;
5201  for (unsigned i = 0; i < 4; ++i) {
5202    if (LoQuad[i] > MaxQuad) {
5203      BestLoQuad = i;
5204      MaxQuad = LoQuad[i];
5205    }
5206  }
5207
5208  int BestHiQuad = -1;
5209  MaxQuad = 1;
5210  for (unsigned i = 0; i < 4; ++i) {
5211    if (HiQuad[i] > MaxQuad) {
5212      BestHiQuad = i;
5213      MaxQuad = HiQuad[i];
5214    }
5215  }
5216
5217  // For SSSE3, If all 8 words of the result come from only 1 quadword of each
5218  // of the two input vectors, shuffle them into one input vector so only a
5219  // single pshufb instruction is necessary. If There are more than 2 input
5220  // quads, disable the next transformation since it does not help SSSE3.
5221  bool V1Used = InputQuads[0] || InputQuads[1];
5222  bool V2Used = InputQuads[2] || InputQuads[3];
5223  if (Subtarget->hasSSSE3()) {
5224    if (InputQuads.count() == 2 && V1Used && V2Used) {
5225      BestLoQuad = InputQuads.find_first();
5226      BestHiQuad = InputQuads.find_next(BestLoQuad);
5227    }
5228    if (InputQuads.count() > 2) {
5229      BestLoQuad = -1;
5230      BestHiQuad = -1;
5231    }
5232  }
5233
5234  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
5235  // the shuffle mask.  If a quad is scored as -1, that means that it contains
5236  // words from all 4 input quadwords.
5237  SDValue NewV;
5238  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
5239    SmallVector<int, 8> MaskV;
5240    MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
5241    MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
5242    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
5243                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
5244                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
5245    NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
5246
5247    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
5248    // source words for the shuffle, to aid later transformations.
5249    bool AllWordsInNewV = true;
5250    bool InOrder[2] = { true, true };
5251    for (unsigned i = 0; i != 8; ++i) {
5252      int idx = MaskVals[i];
5253      if (idx != (int)i)
5254        InOrder[i/4] = false;
5255      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
5256        continue;
5257      AllWordsInNewV = false;
5258      break;
5259    }
5260
5261    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
5262    if (AllWordsInNewV) {
5263      for (int i = 0; i != 8; ++i) {
5264        int idx = MaskVals[i];
5265        if (idx < 0)
5266          continue;
5267        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
5268        if ((idx != i) && idx < 4)
5269          pshufhw = false;
5270        if ((idx != i) && idx > 3)
5271          pshuflw = false;
5272      }
5273      V1 = NewV;
5274      V2Used = false;
5275      BestLoQuad = 0;
5276      BestHiQuad = 1;
5277    }
5278
5279    // If we've eliminated the use of V2, and the new mask is a pshuflw or
5280    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
5281    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
5282      unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
5283      unsigned TargetMask = 0;
5284      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
5285                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
5286      TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()):
5287                             X86::getShufflePSHUFLWImmediate(NewV.getNode());
5288      V1 = NewV.getOperand(0);
5289      return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
5290    }
5291  }
5292
5293  // If we have SSSE3, and all words of the result are from 1 input vector,
5294  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
5295  // is present, fall back to case 4.
5296  if (Subtarget->hasSSSE3()) {
5297    SmallVector<SDValue,16> pshufbMask;
5298
5299    // If we have elements from both input vectors, set the high bit of the
5300    // shuffle mask element to zero out elements that come from V2 in the V1
5301    // mask, and elements that come from V1 in the V2 mask, so that the two
5302    // results can be OR'd together.
5303    bool TwoInputs = V1Used && V2Used;
5304    for (unsigned i = 0; i != 8; ++i) {
5305      int EltIdx = MaskVals[i] * 2;
5306      if (TwoInputs && (EltIdx >= 16)) {
5307        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
5308        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
5309        continue;
5310      }
5311      pshufbMask.push_back(DAG.getConstant(EltIdx,   MVT::i8));
5312      pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
5313    }
5314    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
5315    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
5316                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5317                                 MVT::v16i8, &pshufbMask[0], 16));
5318    if (!TwoInputs)
5319      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
5320
5321    // Calculate the shuffle mask for the second input, shuffle it, and
5322    // OR it with the first shuffled input.
5323    pshufbMask.clear();
5324    for (unsigned i = 0; i != 8; ++i) {
5325      int EltIdx = MaskVals[i] * 2;
5326      if (EltIdx < 16) {
5327        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
5328        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
5329        continue;
5330      }
5331      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
5332      pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
5333    }
5334    V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
5335    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
5336                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5337                                 MVT::v16i8, &pshufbMask[0], 16));
5338    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
5339    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
5340  }
5341
5342  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
5343  // and update MaskVals with new element order.
5344  BitVector InOrder(8);
5345  if (BestLoQuad >= 0) {
5346    SmallVector<int, 8> MaskV;
5347    for (int i = 0; i != 4; ++i) {
5348      int idx = MaskVals[i];
5349      if (idx < 0) {
5350        MaskV.push_back(-1);
5351        InOrder.set(i);
5352      } else if ((idx / 4) == BestLoQuad) {
5353        MaskV.push_back(idx & 3);
5354        InOrder.set(i);
5355      } else {
5356        MaskV.push_back(-1);
5357      }
5358    }
5359    for (unsigned i = 4; i != 8; ++i)
5360      MaskV.push_back(i);
5361    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
5362                                &MaskV[0]);
5363
5364    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
5365      NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
5366                               NewV.getOperand(0),
5367                               X86::getShufflePSHUFLWImmediate(NewV.getNode()),
5368                               DAG);
5369  }
5370
5371  // If BestHi >= 0, generate a pshufhw to put the high elements in order,
5372  // and update MaskVals with the new element order.
5373  if (BestHiQuad >= 0) {
5374    SmallVector<int, 8> MaskV;
5375    for (unsigned i = 0; i != 4; ++i)
5376      MaskV.push_back(i);
5377    for (unsigned i = 4; i != 8; ++i) {
5378      int idx = MaskVals[i];
5379      if (idx < 0) {
5380        MaskV.push_back(-1);
5381        InOrder.set(i);
5382      } else if ((idx / 4) == BestHiQuad) {
5383        MaskV.push_back((idx & 3) + 4);
5384        InOrder.set(i);
5385      } else {
5386        MaskV.push_back(-1);
5387      }
5388    }
5389    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
5390                                &MaskV[0]);
5391
5392    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
5393      NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
5394                              NewV.getOperand(0),
5395                              X86::getShufflePSHUFHWImmediate(NewV.getNode()),
5396                              DAG);
5397  }
5398
5399  // In case BestHi & BestLo were both -1, which means each quadword has a word
5400  // from each of the four input quadwords, calculate the InOrder bitvector now
5401  // before falling through to the insert/extract cleanup.
5402  if (BestLoQuad == -1 && BestHiQuad == -1) {
5403    NewV = V1;
5404    for (int i = 0; i != 8; ++i)
5405      if (MaskVals[i] < 0 || MaskVals[i] == i)
5406        InOrder.set(i);
5407  }
5408
5409  // The other elements are put in the right place using pextrw and pinsrw.
5410  for (unsigned i = 0; i != 8; ++i) {
5411    if (InOrder[i])
5412      continue;
5413    int EltIdx = MaskVals[i];
5414    if (EltIdx < 0)
5415      continue;
5416    SDValue ExtOp = (EltIdx < 8)
5417    ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
5418                  DAG.getIntPtrConstant(EltIdx))
5419    : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
5420                  DAG.getIntPtrConstant(EltIdx - 8));
5421    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
5422                       DAG.getIntPtrConstant(i));
5423  }
5424  return NewV;
5425}
5426
5427// v16i8 shuffles - Prefer shuffles in the following order:
5428// 1. [ssse3] 1 x pshufb
5429// 2. [ssse3] 2 x pshufb + 1 x por
5430// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
5431static
5432SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
5433                                 SelectionDAG &DAG,
5434                                 const X86TargetLowering &TLI) {
5435  SDValue V1 = SVOp->getOperand(0);
5436  SDValue V2 = SVOp->getOperand(1);
5437  DebugLoc dl = SVOp->getDebugLoc();
5438  SmallVector<int, 16> MaskVals;
5439  SVOp->getMask(MaskVals);
5440
5441  // If we have SSSE3, case 1 is generated when all result bytes come from
5442  // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
5443  // present, fall back to case 3.
5444  // FIXME: kill V2Only once shuffles are canonizalized by getNode.
5445  bool V1Only = true;
5446  bool V2Only = true;
5447  for (unsigned i = 0; i < 16; ++i) {
5448    int EltIdx = MaskVals[i];
5449    if (EltIdx < 0)
5450      continue;
5451    if (EltIdx < 16)
5452      V2Only = false;
5453    else
5454      V1Only = false;
5455  }
5456
5457  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
5458  if (TLI.getSubtarget()->hasSSSE3()) {
5459    SmallVector<SDValue,16> pshufbMask;
5460
5461    // If all result elements are from one input vector, then only translate
5462    // undef mask values to 0x80 (zero out result) in the pshufb mask.
5463    //
5464    // Otherwise, we have elements from both input vectors, and must zero out
5465    // elements that come from V2 in the first mask, and V1 in the second mask
5466    // so that we can OR them together.
5467    bool TwoInputs = !(V1Only || V2Only);
5468    for (unsigned i = 0; i != 16; ++i) {
5469      int EltIdx = MaskVals[i];
5470      if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
5471        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
5472        continue;
5473      }
5474      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
5475    }
5476    // If all the elements are from V2, assign it to V1 and return after
5477    // building the first pshufb.
5478    if (V2Only)
5479      V1 = V2;
5480    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
5481                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5482                                 MVT::v16i8, &pshufbMask[0], 16));
5483    if (!TwoInputs)
5484      return V1;
5485
5486    // Calculate the shuffle mask for the second input, shuffle it, and
5487    // OR it with the first shuffled input.
5488    pshufbMask.clear();
5489    for (unsigned i = 0; i != 16; ++i) {
5490      int EltIdx = MaskVals[i];
5491      if (EltIdx < 16) {
5492        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
5493        continue;
5494      }
5495      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
5496    }
5497    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
5498                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5499                                 MVT::v16i8, &pshufbMask[0], 16));
5500    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
5501  }
5502
5503  // No SSSE3 - Calculate in place words and then fix all out of place words
5504  // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
5505  // the 16 different words that comprise the two doublequadword input vectors.
5506  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
5507  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
5508  SDValue NewV = V2Only ? V2 : V1;
5509  for (int i = 0; i != 8; ++i) {
5510    int Elt0 = MaskVals[i*2];
5511    int Elt1 = MaskVals[i*2+1];
5512
5513    // This word of the result is all undef, skip it.
5514    if (Elt0 < 0 && Elt1 < 0)
5515      continue;
5516
5517    // This word of the result is already in the correct place, skip it.
5518    if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
5519      continue;
5520    if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
5521      continue;
5522
5523    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
5524    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
5525    SDValue InsElt;
5526
5527    // If Elt0 and Elt1 are defined, are consecutive, and can be load
5528    // using a single extract together, load it and store it.
5529    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
5530      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
5531                           DAG.getIntPtrConstant(Elt1 / 2));
5532      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
5533                        DAG.getIntPtrConstant(i));
5534      continue;
5535    }
5536
5537    // If Elt1 is defined, extract it from the appropriate source.  If the
5538    // source byte is not also odd, shift the extracted word left 8 bits
5539    // otherwise clear the bottom 8 bits if we need to do an or.
5540    if (Elt1 >= 0) {
5541      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
5542                           DAG.getIntPtrConstant(Elt1 / 2));
5543      if ((Elt1 & 1) == 0)
5544        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
5545                             DAG.getConstant(8,
5546                                  TLI.getShiftAmountTy(InsElt.getValueType())));
5547      else if (Elt0 >= 0)
5548        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
5549                             DAG.getConstant(0xFF00, MVT::i16));
5550    }
5551    // If Elt0 is defined, extract it from the appropriate source.  If the
5552    // source byte is not also even, shift the extracted word right 8 bits. If
5553    // Elt1 was also defined, OR the extracted values together before
5554    // inserting them in the result.
5555    if (Elt0 >= 0) {
5556      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
5557                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
5558      if ((Elt0 & 1) != 0)
5559        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
5560                              DAG.getConstant(8,
5561                                 TLI.getShiftAmountTy(InsElt0.getValueType())));
5562      else if (Elt1 >= 0)
5563        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
5564                             DAG.getConstant(0x00FF, MVT::i16));
5565      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
5566                         : InsElt0;
5567    }
5568    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
5569                       DAG.getIntPtrConstant(i));
5570  }
5571  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
5572}
5573
5574/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
5575/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
5576/// done when every pair / quad of shuffle mask elements point to elements in
5577/// the right sequence. e.g.
5578/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
5579static
5580SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
5581                                 SelectionDAG &DAG, DebugLoc dl) {
5582  EVT VT = SVOp->getValueType(0);
5583  SDValue V1 = SVOp->getOperand(0);
5584  SDValue V2 = SVOp->getOperand(1);
5585  unsigned NumElems = VT.getVectorNumElements();
5586  unsigned NewWidth = (NumElems == 4) ? 2 : 4;
5587  EVT NewVT;
5588  switch (VT.getSimpleVT().SimpleTy) {
5589  default: assert(false && "Unexpected!");
5590  case MVT::v4f32: NewVT = MVT::v2f64; break;
5591  case MVT::v4i32: NewVT = MVT::v2i64; break;
5592  case MVT::v8i16: NewVT = MVT::v4i32; break;
5593  case MVT::v16i8: NewVT = MVT::v4i32; break;
5594  }
5595
5596  int Scale = NumElems / NewWidth;
5597  SmallVector<int, 8> MaskVec;
5598  for (unsigned i = 0; i < NumElems; i += Scale) {
5599    int StartIdx = -1;
5600    for (int j = 0; j < Scale; ++j) {
5601      int EltIdx = SVOp->getMaskElt(i+j);
5602      if (EltIdx < 0)
5603        continue;
5604      if (StartIdx == -1)
5605        StartIdx = EltIdx - (EltIdx % Scale);
5606      if (EltIdx != StartIdx + j)
5607        return SDValue();
5608    }
5609    if (StartIdx == -1)
5610      MaskVec.push_back(-1);
5611    else
5612      MaskVec.push_back(StartIdx / Scale);
5613  }
5614
5615  V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
5616  V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
5617  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
5618}
5619
5620/// getVZextMovL - Return a zero-extending vector move low node.
5621///
5622static SDValue getVZextMovL(EVT VT, EVT OpVT,
5623                            SDValue SrcOp, SelectionDAG &DAG,
5624                            const X86Subtarget *Subtarget, DebugLoc dl) {
5625  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
5626    LoadSDNode *LD = NULL;
5627    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
5628      LD = dyn_cast<LoadSDNode>(SrcOp);
5629    if (!LD) {
5630      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
5631      // instead.
5632      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
5633      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
5634          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5635          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
5636          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
5637        // PR2108
5638        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
5639        return DAG.getNode(ISD::BITCAST, dl, VT,
5640                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
5641                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5642                                                   OpVT,
5643                                                   SrcOp.getOperand(0)
5644                                                          .getOperand(0))));
5645      }
5646    }
5647  }
5648
5649  return DAG.getNode(ISD::BITCAST, dl, VT,
5650                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
5651                                 DAG.getNode(ISD::BITCAST, dl,
5652                                             OpVT, SrcOp)));
5653}
5654
5655/// areShuffleHalvesWithinDisjointLanes - Check whether each half of a vector
5656/// shuffle node referes to only one lane in the sources.
5657static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) {
5658  EVT VT = SVOp->getValueType(0);
5659  int NumElems = VT.getVectorNumElements();
5660  int HalfSize = NumElems/2;
5661  SmallVector<int, 16> M;
5662  SVOp->getMask(M);
5663  bool MatchA = false, MatchB = false;
5664
5665  for (int l = 0; l < NumElems*2; l += HalfSize) {
5666    if (isUndefOrInRange(M, 0, HalfSize, l, l+HalfSize)) {
5667      MatchA = true;
5668      break;
5669    }
5670  }
5671
5672  for (int l = 0; l < NumElems*2; l += HalfSize) {
5673    if (isUndefOrInRange(M, HalfSize, HalfSize, l, l+HalfSize)) {
5674      MatchB = true;
5675      break;
5676    }
5677  }
5678
5679  return MatchA && MatchB;
5680}
5681
5682/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
5683/// which could not be matched by any known target speficic shuffle
5684static SDValue
5685LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
5686  if (areShuffleHalvesWithinDisjointLanes(SVOp)) {
5687    // If each half of a vector shuffle node referes to only one lane in the
5688    // source vectors, extract each used 128-bit lane and shuffle them using
5689    // 128-bit shuffles. Then, concatenate the results. Otherwise leave
5690    // the work to the legalizer.
5691    DebugLoc dl = SVOp->getDebugLoc();
5692    EVT VT = SVOp->getValueType(0);
5693    int NumElems = VT.getVectorNumElements();
5694    int HalfSize = NumElems/2;
5695
5696    // Extract the reference for each half
5697    int FstVecExtractIdx = 0, SndVecExtractIdx = 0;
5698    int FstVecOpNum = 0, SndVecOpNum = 0;
5699    for (int i = 0; i < HalfSize; ++i) {
5700      int Elt = SVOp->getMaskElt(i);
5701      if (SVOp->getMaskElt(i) < 0)
5702        continue;
5703      FstVecOpNum = Elt/NumElems;
5704      FstVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize;
5705      break;
5706    }
5707    for (int i = HalfSize; i < NumElems; ++i) {
5708      int Elt = SVOp->getMaskElt(i);
5709      if (SVOp->getMaskElt(i) < 0)
5710        continue;
5711      SndVecOpNum = Elt/NumElems;
5712      SndVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize;
5713      break;
5714    }
5715
5716    // Extract the subvectors
5717    SDValue V1 = Extract128BitVector(SVOp->getOperand(FstVecOpNum),
5718                      DAG.getConstant(FstVecExtractIdx, MVT::i32), DAG, dl);
5719    SDValue V2 = Extract128BitVector(SVOp->getOperand(SndVecOpNum),
5720                      DAG.getConstant(SndVecExtractIdx, MVT::i32), DAG, dl);
5721
5722    // Generate 128-bit shuffles
5723    SmallVector<int, 16> MaskV1, MaskV2;
5724    for (int i = 0; i < HalfSize; ++i) {
5725      int Elt = SVOp->getMaskElt(i);
5726      MaskV1.push_back(Elt < 0 ? Elt : Elt % HalfSize);
5727    }
5728    for (int i = HalfSize; i < NumElems; ++i) {
5729      int Elt = SVOp->getMaskElt(i);
5730      MaskV2.push_back(Elt < 0 ? Elt : Elt % HalfSize);
5731    }
5732
5733    EVT NVT = V1.getValueType();
5734    V1 = DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &MaskV1[0]);
5735    V2 = DAG.getVectorShuffle(NVT, dl, V2, DAG.getUNDEF(NVT), &MaskV2[0]);
5736
5737    // Concatenate the result back
5738    SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), V1,
5739                                   DAG.getConstant(0, MVT::i32), DAG, dl);
5740    return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
5741                              DAG, dl);
5742  }
5743
5744  return SDValue();
5745}
5746
5747/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
5748/// 4 elements, and match them with several different shuffle types.
5749static SDValue
5750LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
5751  SDValue V1 = SVOp->getOperand(0);
5752  SDValue V2 = SVOp->getOperand(1);
5753  DebugLoc dl = SVOp->getDebugLoc();
5754  EVT VT = SVOp->getValueType(0);
5755
5756  assert(VT.getSizeInBits() == 128 && "Unsupported vector size");
5757
5758  SmallVector<std::pair<int, int>, 8> Locs;
5759  Locs.resize(4);
5760  SmallVector<int, 8> Mask1(4U, -1);
5761  SmallVector<int, 8> PermMask;
5762  SVOp->getMask(PermMask);
5763
5764  unsigned NumHi = 0;
5765  unsigned NumLo = 0;
5766  for (unsigned i = 0; i != 4; ++i) {
5767    int Idx = PermMask[i];
5768    if (Idx < 0) {
5769      Locs[i] = std::make_pair(-1, -1);
5770    } else {
5771      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
5772      if (Idx < 4) {
5773        Locs[i] = std::make_pair(0, NumLo);
5774        Mask1[NumLo] = Idx;
5775        NumLo++;
5776      } else {
5777        Locs[i] = std::make_pair(1, NumHi);
5778        if (2+NumHi < 4)
5779          Mask1[2+NumHi] = Idx;
5780        NumHi++;
5781      }
5782    }
5783  }
5784
5785  if (NumLo <= 2 && NumHi <= 2) {
5786    // If no more than two elements come from either vector. This can be
5787    // implemented with two shuffles. First shuffle gather the elements.
5788    // The second shuffle, which takes the first shuffle as both of its
5789    // vector operands, put the elements into the right order.
5790    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
5791
5792    SmallVector<int, 8> Mask2(4U, -1);
5793
5794    for (unsigned i = 0; i != 4; ++i) {
5795      if (Locs[i].first == -1)
5796        continue;
5797      else {
5798        unsigned Idx = (i < 2) ? 0 : 4;
5799        Idx += Locs[i].first * 2 + Locs[i].second;
5800        Mask2[i] = Idx;
5801      }
5802    }
5803
5804    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
5805  } else if (NumLo == 3 || NumHi == 3) {
5806    // Otherwise, we must have three elements from one vector, call it X, and
5807    // one element from the other, call it Y.  First, use a shufps to build an
5808    // intermediate vector with the one element from Y and the element from X
5809    // that will be in the same half in the final destination (the indexes don't
5810    // matter). Then, use a shufps to build the final vector, taking the half
5811    // containing the element from Y from the intermediate, and the other half
5812    // from X.
5813    if (NumHi == 3) {
5814      // Normalize it so the 3 elements come from V1.
5815      CommuteVectorShuffleMask(PermMask, VT);
5816      std::swap(V1, V2);
5817    }
5818
5819    // Find the element from V2.
5820    unsigned HiIndex;
5821    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
5822      int Val = PermMask[HiIndex];
5823      if (Val < 0)
5824        continue;
5825      if (Val >= 4)
5826        break;
5827    }
5828
5829    Mask1[0] = PermMask[HiIndex];
5830    Mask1[1] = -1;
5831    Mask1[2] = PermMask[HiIndex^1];
5832    Mask1[3] = -1;
5833    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
5834
5835    if (HiIndex >= 2) {
5836      Mask1[0] = PermMask[0];
5837      Mask1[1] = PermMask[1];
5838      Mask1[2] = HiIndex & 1 ? 6 : 4;
5839      Mask1[3] = HiIndex & 1 ? 4 : 6;
5840      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
5841    } else {
5842      Mask1[0] = HiIndex & 1 ? 2 : 0;
5843      Mask1[1] = HiIndex & 1 ? 0 : 2;
5844      Mask1[2] = PermMask[2];
5845      Mask1[3] = PermMask[3];
5846      if (Mask1[2] >= 0)
5847        Mask1[2] += 4;
5848      if (Mask1[3] >= 0)
5849        Mask1[3] += 4;
5850      return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
5851    }
5852  }
5853
5854  // Break it into (shuffle shuffle_hi, shuffle_lo).
5855  Locs.clear();
5856  Locs.resize(4);
5857  SmallVector<int,8> LoMask(4U, -1);
5858  SmallVector<int,8> HiMask(4U, -1);
5859
5860  SmallVector<int,8> *MaskPtr = &LoMask;
5861  unsigned MaskIdx = 0;
5862  unsigned LoIdx = 0;
5863  unsigned HiIdx = 2;
5864  for (unsigned i = 0; i != 4; ++i) {
5865    if (i == 2) {
5866      MaskPtr = &HiMask;
5867      MaskIdx = 1;
5868      LoIdx = 0;
5869      HiIdx = 2;
5870    }
5871    int Idx = PermMask[i];
5872    if (Idx < 0) {
5873      Locs[i] = std::make_pair(-1, -1);
5874    } else if (Idx < 4) {
5875      Locs[i] = std::make_pair(MaskIdx, LoIdx);
5876      (*MaskPtr)[LoIdx] = Idx;
5877      LoIdx++;
5878    } else {
5879      Locs[i] = std::make_pair(MaskIdx, HiIdx);
5880      (*MaskPtr)[HiIdx] = Idx;
5881      HiIdx++;
5882    }
5883  }
5884
5885  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
5886  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
5887  SmallVector<int, 8> MaskOps;
5888  for (unsigned i = 0; i != 4; ++i) {
5889    if (Locs[i].first == -1) {
5890      MaskOps.push_back(-1);
5891    } else {
5892      unsigned Idx = Locs[i].first * 4 + Locs[i].second;
5893      MaskOps.push_back(Idx);
5894    }
5895  }
5896  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
5897}
5898
5899static bool MayFoldVectorLoad(SDValue V) {
5900  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
5901    V = V.getOperand(0);
5902  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5903    V = V.getOperand(0);
5904  if (MayFoldLoad(V))
5905    return true;
5906  return false;
5907}
5908
5909// FIXME: the version above should always be used. Since there's
5910// a bug where several vector shuffles can't be folded because the
5911// DAG is not updated during lowering and a node claims to have two
5912// uses while it only has one, use this version, and let isel match
5913// another instruction if the load really happens to have more than
5914// one use. Remove this version after this bug get fixed.
5915// rdar://8434668, PR8156
5916static bool RelaxedMayFoldVectorLoad(SDValue V) {
5917  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
5918    V = V.getOperand(0);
5919  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5920    V = V.getOperand(0);
5921  if (ISD::isNormalLoad(V.getNode()))
5922    return true;
5923  return false;
5924}
5925
5926/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by
5927/// a vector extract, and if both can be later optimized into a single load.
5928/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked
5929/// here because otherwise a target specific shuffle node is going to be
5930/// emitted for this shuffle, and the optimization not done.
5931/// FIXME: This is probably not the best approach, but fix the problem
5932/// until the right path is decided.
5933static
5934bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG,
5935                                         const TargetLowering &TLI) {
5936  EVT VT = V.getValueType();
5937  ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V);
5938
5939  // Be sure that the vector shuffle is present in a pattern like this:
5940  // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr)
5941  if (!V.hasOneUse())
5942    return false;
5943
5944  SDNode *N = *V.getNode()->use_begin();
5945  if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5946    return false;
5947
5948  SDValue EltNo = N->getOperand(1);
5949  if (!isa<ConstantSDNode>(EltNo))
5950    return false;
5951
5952  // If the bit convert changed the number of elements, it is unsafe
5953  // to examine the mask.
5954  bool HasShuffleIntoBitcast = false;
5955  if (V.getOpcode() == ISD::BITCAST) {
5956    EVT SrcVT = V.getOperand(0).getValueType();
5957    if (SrcVT.getVectorNumElements() != VT.getVectorNumElements())
5958      return false;
5959    V = V.getOperand(0);
5960    HasShuffleIntoBitcast = true;
5961  }
5962
5963  // Select the input vector, guarding against out of range extract vector.
5964  unsigned NumElems = VT.getVectorNumElements();
5965  unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
5966  int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt);
5967  V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1);
5968
5969  // Skip one more bit_convert if necessary
5970  if (V.getOpcode() == ISD::BITCAST)
5971    V = V.getOperand(0);
5972
5973  if (ISD::isNormalLoad(V.getNode())) {
5974    // Is the original load suitable?
5975    LoadSDNode *LN0 = cast<LoadSDNode>(V);
5976
5977    // FIXME: avoid the multi-use bug that is preventing lots of
5978    // of foldings to be detected, this is still wrong of course, but
5979    // give the temporary desired behavior, and if it happens that
5980    // the load has real more uses, during isel it will not fold, and
5981    // will generate poor code.
5982    if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse()
5983      return false;
5984
5985    if (!HasShuffleIntoBitcast)
5986      return true;
5987
5988    // If there's a bitcast before the shuffle, check if the load type and
5989    // alignment is valid.
5990    unsigned Align = LN0->getAlignment();
5991    unsigned NewAlign =
5992      TLI.getTargetData()->getABITypeAlignment(
5993                                    VT.getTypeForEVT(*DAG.getContext()));
5994
5995    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
5996      return false;
5997  }
5998
5999  return true;
6000}
6001
6002static
6003SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
6004  EVT VT = Op.getValueType();
6005
6006  // Canonizalize to v2f64.
6007  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
6008  return DAG.getNode(ISD::BITCAST, dl, VT,
6009                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
6010                                          V1, DAG));
6011}
6012
6013static
6014SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
6015                        bool HasSSE2) {
6016  SDValue V1 = Op.getOperand(0);
6017  SDValue V2 = Op.getOperand(1);
6018  EVT VT = Op.getValueType();
6019
6020  assert(VT != MVT::v2i64 && "unsupported shuffle type");
6021
6022  if (HasSSE2 && VT == MVT::v2f64)
6023    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
6024
6025  // v4f32 or v4i32
6026  return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG);
6027}
6028
6029static
6030SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
6031  SDValue V1 = Op.getOperand(0);
6032  SDValue V2 = Op.getOperand(1);
6033  EVT VT = Op.getValueType();
6034
6035  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
6036         "unsupported shuffle type");
6037
6038  if (V2.getOpcode() == ISD::UNDEF)
6039    V2 = V1;
6040
6041  // v4i32 or v4f32
6042  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
6043}
6044
6045static
6046SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
6047  SDValue V1 = Op.getOperand(0);
6048  SDValue V2 = Op.getOperand(1);
6049  EVT VT = Op.getValueType();
6050  unsigned NumElems = VT.getVectorNumElements();
6051
6052  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
6053  // operand of these instructions is only memory, so check if there's a
6054  // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
6055  // same masks.
6056  bool CanFoldLoad = false;
6057
6058  // Trivial case, when V2 comes from a load.
6059  if (MayFoldVectorLoad(V2))
6060    CanFoldLoad = true;
6061
6062  // When V1 is a load, it can be folded later into a store in isel, example:
6063  //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
6064  //    turns into:
6065  //  (MOVLPSmr addr:$src1, VR128:$src2)
6066  // So, recognize this potential and also use MOVLPS or MOVLPD
6067  if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
6068    CanFoldLoad = true;
6069
6070  // Both of them can't be memory operations though.
6071  if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2))
6072    CanFoldLoad = false;
6073
6074  if (CanFoldLoad) {
6075    if (HasSSE2 && NumElems == 2)
6076      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
6077
6078    if (NumElems == 4)
6079      return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
6080  }
6081
6082  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6083  // movl and movlp will both match v2i64, but v2i64 is never matched by
6084  // movl earlier because we make it strict to avoid messing with the movlp load
6085  // folding logic (see the code above getMOVLP call). Match it here then,
6086  // this is horrible, but will stay like this until we move all shuffle
6087  // matching to x86 specific nodes. Note that for the 1st condition all
6088  // types are matched with movsd.
6089  if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp))
6090    return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
6091  else if (HasSSE2)
6092    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
6093
6094
6095  assert(VT != MVT::v4i32 && "unsupported shuffle type");
6096
6097  // Invert the operand order and use SHUFPS to match it.
6098  return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1,
6099                              X86::getShuffleSHUFImmediate(SVOp), DAG);
6100}
6101
6102static inline unsigned getUNPCKLOpcode(EVT VT) {
6103  switch(VT.getSimpleVT().SimpleTy) {
6104  case MVT::v4i32: return X86ISD::PUNPCKLDQ;
6105  case MVT::v2i64: return X86ISD::PUNPCKLQDQ;
6106  case MVT::v4f32: return X86ISD::UNPCKLPS;
6107  case MVT::v2f64: return X86ISD::UNPCKLPD;
6108  case MVT::v8i32: // Use fp unit for int unpack.
6109  case MVT::v8f32: return X86ISD::VUNPCKLPSY;
6110  case MVT::v4i64: // Use fp unit for int unpack.
6111  case MVT::v4f64: return X86ISD::VUNPCKLPDY;
6112  case MVT::v16i8: return X86ISD::PUNPCKLBW;
6113  case MVT::v8i16: return X86ISD::PUNPCKLWD;
6114  default:
6115    llvm_unreachable("Unknown type for unpckl");
6116  }
6117  return 0;
6118}
6119
6120static inline unsigned getUNPCKHOpcode(EVT VT) {
6121  switch(VT.getSimpleVT().SimpleTy) {
6122  case MVT::v4i32: return X86ISD::PUNPCKHDQ;
6123  case MVT::v2i64: return X86ISD::PUNPCKHQDQ;
6124  case MVT::v4f32: return X86ISD::UNPCKHPS;
6125  case MVT::v2f64: return X86ISD::UNPCKHPD;
6126  case MVT::v8i32: // Use fp unit for int unpack.
6127  case MVT::v8f32: return X86ISD::VUNPCKHPSY;
6128  case MVT::v4i64: // Use fp unit for int unpack.
6129  case MVT::v4f64: return X86ISD::VUNPCKHPDY;
6130  case MVT::v16i8: return X86ISD::PUNPCKHBW;
6131  case MVT::v8i16: return X86ISD::PUNPCKHWD;
6132  default:
6133    llvm_unreachable("Unknown type for unpckh");
6134  }
6135  return 0;
6136}
6137
6138static inline unsigned getVPERMILOpcode(EVT VT) {
6139  switch(VT.getSimpleVT().SimpleTy) {
6140  case MVT::v4i32:
6141  case MVT::v4f32: return X86ISD::VPERMILPS;
6142  case MVT::v2i64:
6143  case MVT::v2f64: return X86ISD::VPERMILPD;
6144  case MVT::v8i32:
6145  case MVT::v8f32: return X86ISD::VPERMILPSY;
6146  case MVT::v4i64:
6147  case MVT::v4f64: return X86ISD::VPERMILPDY;
6148  default:
6149    llvm_unreachable("Unknown type for vpermil");
6150  }
6151  return 0;
6152}
6153
6154/// isVectorBroadcast - Check if the node chain is suitable to be xformed to
6155/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming
6156/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded.
6157static bool isVectorBroadcast(SDValue &Op) {
6158  EVT VT = Op.getValueType();
6159  bool Is256 = VT.getSizeInBits() == 256;
6160
6161  assert((VT.getSizeInBits() == 128 || Is256) &&
6162         "Unsupported type for vbroadcast node");
6163
6164  SDValue V = Op;
6165  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
6166    V = V.getOperand(0);
6167
6168  if (Is256 && !(V.hasOneUse() &&
6169                 V.getOpcode() == ISD::INSERT_SUBVECTOR &&
6170                 V.getOperand(0).getOpcode() == ISD::UNDEF))
6171    return false;
6172
6173  if (Is256)
6174    V = V.getOperand(1);
6175  if (V.hasOneUse() && V.getOpcode() != ISD::SCALAR_TO_VECTOR)
6176    return false;
6177
6178  // Check the source scalar_to_vector type. 256-bit broadcasts are
6179  // supported for 32/64-bit sizes, while 128-bit ones are only supported
6180  // for 32-bit scalars.
6181  unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits();
6182  if (ScalarSize != 32 && ScalarSize != 64)
6183    return false;
6184  if (!Is256 && ScalarSize == 64)
6185    return false;
6186
6187  V = V.getOperand(0);
6188  if (!MayFoldLoad(V))
6189    return false;
6190
6191  // Return the load node
6192  Op = V;
6193  return true;
6194}
6195
6196static
6197SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
6198                               const TargetLowering &TLI,
6199                               const X86Subtarget *Subtarget) {
6200  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6201  EVT VT = Op.getValueType();
6202  DebugLoc dl = Op.getDebugLoc();
6203  SDValue V1 = Op.getOperand(0);
6204  SDValue V2 = Op.getOperand(1);
6205
6206  if (isZeroShuffle(SVOp))
6207    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
6208
6209  // Handle splat operations
6210  if (SVOp->isSplat()) {
6211    unsigned NumElem = VT.getVectorNumElements();
6212    // Special case, this is the only place now where it's allowed to return
6213    // a vector_shuffle operation without using a target specific node, because
6214    // *hopefully* it will be optimized away by the dag combiner. FIXME: should
6215    // this be moved to DAGCombine instead?
6216    if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
6217      return Op;
6218
6219    // Use vbroadcast whenever the splat comes from a foldable load
6220    if (Subtarget->hasAVX() && isVectorBroadcast(V1))
6221      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1);
6222
6223    // Handle splats by matching through known shuffle masks
6224    if (VT.is128BitVector() && NumElem <= 4)
6225      return SDValue();
6226
6227    // All remaning splats are promoted to target supported vector shuffles.
6228    return PromoteSplat(SVOp, DAG);
6229  }
6230
6231  // If the shuffle can be profitably rewritten as a narrower shuffle, then
6232  // do it!
6233  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
6234    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6235    if (NewOp.getNode())
6236      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
6237  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
6238    // FIXME: Figure out a cleaner way to do this.
6239    // Try to make use of movq to zero out the top part.
6240    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
6241      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6242      if (NewOp.getNode()) {
6243        if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
6244          return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
6245                              DAG, Subtarget, dl);
6246      }
6247    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
6248      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6249      if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
6250        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
6251                            DAG, Subtarget, dl);
6252    }
6253  }
6254  return SDValue();
6255}
6256
/// LowerVECTOR_SHUFFLE - Custom lowering of a vector_shuffle node.
///
/// Shuffles on 64-bit wide types are returned unchanged. Every other shuffle
/// is first passed to NormalizeVectorShuffle and then run through a long
/// sequence of mask checks; the first check that matches returns either an
/// X86 target shuffle node, one of the inputs, a vector shift, or a new
/// (commuted / re-masked) generic shuffle. Shuffles that match none of the
/// masks are handed to the v8i16, v16i8, 4-element 128-bit and 256-bit
/// helpers. An empty SDValue is returned when nothing applies.
///
/// Several of the masks overlap (see the NOTE and FIXME comments in the
/// body), so the order of the checks is significant.
6257SDValue
6258X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
6259  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6260  SDValue V1 = Op.getOperand(0);
6261  SDValue V2 = Op.getOperand(1);
6262  EVT VT = Op.getValueType();
6263  DebugLoc dl = Op.getDebugLoc();
6264  unsigned NumElems = VT.getVectorNumElements();
6265  bool isMMX = VT.getSizeInBits() == 64;
6266  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
6267  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
6268  bool V1IsSplat = false;
6269  bool V2IsSplat = false;
      // Having AVX counts as having each of the SSE levels tested below.
6270  bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX();
6271  bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX();
6272  bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX();
6273  MachineFunction &MF = DAG.getMachineFunction();
6274  bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
6275
6276  // Shuffle operations on MMX not supported.
6277  if (isMMX)
6278    return Op;
6279
6280  // Vector shuffle lowering takes 3 steps:
6281  //
6282  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
6283  //    narrowing and commutation of operands should be handled.
6284  // 2) Matching of shuffles with known shuffle masks to x86 target specific
6285  //    shuffle nodes.
6286  // 3) Rewriting of unmatched masks into new generic shuffle operations,
6287  //    so the shuffle can be broken into other shuffles and the legalizer can
6288  //    try the lowering again.
6289  //
6290  // The general idea is that no vector_shuffle operation should be left to
6291  // be matched during isel, all of them must be converted to a target specific
6292  // node here.
6293
6294  // Normalize the input vectors. Here splats, zeroed vectors, profitable
6295  // narrowing and commutation of operands should be handled. The actual code
6296  // doesn't include all of those, work in progress...
6297  SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget);
6298  if (NewOp.getNode())
6299    return NewOp;
6300
6301  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
6302  // unpckh_undef). Only use pshufd if speed is more important than size.
6303  if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp))
6304    return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG);
6305  if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp))
6306    return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
6307
      // MOVDDUP is only used with SSE3, an undef V2 and a V1 accepted by
      // RelaxedMayFoldVectorLoad.
6308  if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef &&
6309      RelaxedMayFoldVectorLoad(V1))
6310    return getMOVDDup(Op, dl, V1, DAG);
6311
6312  if (X86::isMOVHLPS_v_undef_Mask(SVOp))
6313    return getMOVHighToLow(Op, dl, DAG);
6314
6315  // Use to match splats
6316  if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef &&
6317      (VT == MVT::v2f64 || VT == MVT::v2i64))
6318    return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
6319
6320  if (X86::isPSHUFDMask(SVOp)) {
6321    // The actual implementation will match the mask in the if above and then
6322    // during isel it can match several different instructions, not only pshufd
6323    // as its name says, sad but true, emulate the behavior for now...
6324    if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
6325        return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
6326
6327    unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
6328
6329    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
6330      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
6331
6332    if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
6333      return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1,
6334                                  TargetMask, DAG);
6335
6336    if (VT == MVT::v4f32)
6337      return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1,
6338                                  TargetMask, DAG);
6339  }
6340
6341  // Check if this can be converted into a logical shift.
6342  bool isLeft = false;
6343  unsigned ShAmt = 0;
6344  SDValue ShVal;
6345  bool isShift = getSubtarget()->hasSSE2() &&
6346    isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
6347  if (isShift && ShVal.hasOneUse()) {
6348    // If the shifted value has multiple uses, it may be cheaper to use
6349    // v_set0 + movlhps or movhlps, etc.
6350    EVT EltVT = VT.getVectorElementType();
6351    ShAmt *= EltVT.getSizeInBits();
6352    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
6353  }
6354
6355  if (X86::isMOVLMask(SVOp)) {
6356    if (V1IsUndef)
6357      return V2;
6358    if (ISD::isBuildVectorAllZeros(V1.getNode()))
6359      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
6360    if (!X86::isMOVLPMask(SVOp)) {
6361      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
6362        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
6363
6364      if (VT == MVT::v4i32 || VT == MVT::v4f32)
6365        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
6366    }
6367  }
6368
6369  // FIXME: fold these into legal mask.
6370  if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp))
6371    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
6372
6373  if (X86::isMOVHLPSMask(SVOp))
6374    return getMOVHighToLow(Op, dl, DAG);
6375
6376  if (X86::isMOVSHDUPMask(SVOp, Subtarget))
6377    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
6378
6379  if (X86::isMOVSLDUPMask(SVOp, Subtarget))
6380    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
6381
6382  if (X86::isMOVLPMask(SVOp))
6383    return getMOVLP(Op, dl, DAG, HasSSE2);
6384
      // Here the commuted shuffle itself is returned, not a target node.
6385  if (ShouldXformToMOVHLPS(SVOp) ||
6386      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
6387    return CommuteVectorShuffle(SVOp, DAG);
6388
6389  if (isShift) {
6390    // No better options. Use a vshl / vsrl.
6391    EVT EltVT = VT.getVectorElementType();
6392    ShAmt *= EltVT.getSizeInBits();
6393    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
6394  }
6395
6396  bool Commuted = false;
6397  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
6398  // 1,1,1,1 -> v8i16 though.
6399  V1IsSplat = isSplatVector(V1.getNode());
6400  V2IsSplat = isSplatVector(V2.getNode());
6401
6402  // Canonicalize the splat or undef, if present, to be on the RHS.
6403  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
6404    Op = CommuteVectorShuffle(SVOp, DAG);
6405    SVOp = cast<ShuffleVectorSDNode>(Op);
6406    V1 = SVOp->getOperand(0);
6407    V2 = SVOp->getOperand(1);
6408    std::swap(V1IsSplat, V2IsSplat);
6409    std::swap(V1IsUndef, V2IsUndef);
6410    Commuted = true;
6411  }
6412
6413  if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
6414    // Shuffling low element of v1 into undef, just return v1.
6415    if (V2IsUndef)
6416      return V1;
6417    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
6418    // the instruction selector will not match, so get a canonical MOVL with
6419    // swapped operands to undo the commute.
6420    return getMOVL(DAG, dl, VT, V2, V1);
6421  }
6422
6423  if (X86::isUNPCKLMask(SVOp))
6424    return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG);
6425
6426  if (X86::isUNPCKHMask(SVOp))
6427    return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG);
6428
6429  if (V2IsSplat) {
6430    // Normalize mask so all entries that point to V2 points to its first
6431    // element then try to match unpck{h|l} again. If match, return a
6432    // new vector_shuffle with the corrected mask.
6433    SDValue NewMask = NormalizeMask(SVOp, DAG);
6434    ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
6435    if (NSVOp != SVOp) {
6436      if (X86::isUNPCKLMask(NSVOp, true)) {
6437        return NewMask;
6438      } else if (X86::isUNPCKHMask(NSVOp, true)) {
6439        return NewMask;
6440      }
6441    }
6442  }
6443
6444  if (Commuted) {
6445    // Commute is back and try unpck* again.
6446    // FIXME: this seems wrong.
6447    SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
6448    ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
6449
6450    if (X86::isUNPCKLMask(NewSVOp))
6451      return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG);
6452
6453    if (X86::isUNPCKHMask(NewSVOp))
6454      return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG);
6455  }
6456
6457  // Normalize the node to match x86 shuffle ops if needed
6458  if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
6459    return CommuteVectorShuffle(SVOp, DAG);
6460
6461  // The checks below are all present in isShuffleMaskLegal, but they are
6462  // inlined here right now to enable us to directly emit target specific
6463  // nodes, and remove one by one until they don't return Op anymore.
      // M is the mask of SVOp, i.e. of the commuted shuffle if the operands
      // were swapped above.
6464  SmallVector<int, 16> M;
6465  SVOp->getMask(M);
6466
6467  if (isPALIGNRMask(M, VT, HasSSSE3))
6468    return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
6469                                X86::getShufflePALIGNRImmediate(SVOp),
6470                                DAG);
6471
6472  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
6473      SVOp->getSplatIndex() == 0 && V2IsUndef) {
6474    if (VT == MVT::v2f64)
6475      return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG);
6476    if (VT == MVT::v2i64)
6477      return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG);
6478  }
6479
6480  if (isPSHUFHWMask(M, VT))
6481    return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
6482                                X86::getShufflePSHUFHWImmediate(SVOp),
6483                                DAG);
6484
6485  if (isPSHUFLWMask(M, VT))
6486    return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
6487                                X86::getShufflePSHUFLWImmediate(SVOp),
6488                                DAG);
6489
6490  if (isSHUFPMask(M, VT)) {
6491    unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
6492    if (VT == MVT::v4f32 || VT == MVT::v4i32)
6493      return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2,
6494                                  TargetMask, DAG);
6495    if (VT == MVT::v2f64 || VT == MVT::v2i64)
6496      return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2,
6497                                  TargetMask, DAG);
6498  }
6499
6500  if (X86::isUNPCKL_v_undef_Mask(SVOp))
6501    return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG);
6502  if (X86::isUNPCKH_v_undef_Mask(SVOp))
6503    return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
6504
6505  //===--------------------------------------------------------------------===//
6506  // Generate target specific nodes for 128 or 256-bit shuffles only
6507  // supported in the AVX instruction set.
6508  //
6509
6510  // Handle VPERMILPS* permutations
6511  if (isVPERMILPSMask(M, VT, Subtarget))
6512    return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
6513                                getShuffleVPERMILPSImmediate(SVOp), DAG);
6514
6515  // Handle VPERMILPD* permutations
6516  if (isVPERMILPDMask(M, VT, Subtarget))
6517    return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
6518                                getShuffleVPERMILPDImmediate(SVOp), DAG);
6519
6520  // Handle VPERM2F128 permutations
6521  if (isVPERM2F128Mask(M, VT, Subtarget))
6522    return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2,
6523                                getShuffleVPERM2F128Immediate(SVOp), DAG);
6524
6525  //===--------------------------------------------------------------------===//
6526  // Since no target specific shuffle was selected for this generic one,
6527  // lower it into other known shuffles. FIXME: this isn't true yet, but
6528  // this is the plan.
6529  //
6530
6531  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
6532  if (VT == MVT::v8i16) {
6533    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG);
6534    if (NewOp.getNode())
6535      return NewOp;
6536  }
6537
6538  if (VT == MVT::v16i8) {
6539    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
6540    if (NewOp.getNode())
6541      return NewOp;
6542  }
6543
6544  // Handle all 128-bit wide vectors with 4 elements, and match them with
6545  // several different shuffle types.
6546  if (NumElems == 4 && VT.getSizeInBits() == 128)
6547    return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
6548
6549  // Handle general 256-bit shuffles
6550  if (VT.is256BitVector())
6551    return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
6552
      // Nothing matched: return an empty SDValue.
6553  return SDValue();
6554}
6555
6556SDValue
6557X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
6558                                                SelectionDAG &DAG) const {
6559  EVT VT = Op.getValueType();
6560  DebugLoc dl = Op.getDebugLoc();
6561
6562  if (Op.getOperand(0).getValueType().getSizeInBits() != 128)
6563    return SDValue();
6564
6565  if (VT.getSizeInBits() == 8) {
6566    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
6567                                    Op.getOperand(0), Op.getOperand(1));
6568    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
6569                                    DAG.getValueType(VT));
6570    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
6571  } else if (VT.getSizeInBits() == 16) {
6572    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6573    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
6574    if (Idx == 0)
6575      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
6576                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
6577                                     DAG.getNode(ISD::BITCAST, dl,
6578                                                 MVT::v4i32,
6579                                                 Op.getOperand(0)),
6580                                     Op.getOperand(1)));
6581    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
6582                                    Op.getOperand(0), Op.getOperand(1));
6583    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
6584                                    DAG.getValueType(VT));
6585    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
6586  } else if (VT == MVT::f32) {
6587    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
6588    // the result back to FR32 register. It's only worth matching if the
6589    // result has a single use which is a store or a bitcast to i32.  And in
6590    // the case of a store, it's not worth it if the index is a constant 0,
6591    // because a MOVSSmr can be used instead, which is smaller and faster.
6592    if (!Op.hasOneUse())
6593      return SDValue();
6594    SDNode *User = *Op.getNode()->use_begin();
6595    if ((User->getOpcode() != ISD::STORE ||
6596         (isa<ConstantSDNode>(Op.getOperand(1)) &&
6597          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
6598        (User->getOpcode() != ISD::BITCAST ||
6599         User->getValueType(0) != MVT::i32))
6600      return SDValue();
6601    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
6602                                  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
6603                                              Op.getOperand(0)),
6604                                              Op.getOperand(1));
6605    return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
6606  } else if (VT == MVT::i32) {
6607    // ExtractPS works with constant index.
6608    if (isa<ConstantSDNode>(Op.getOperand(1)))
6609      return Op;
6610  }
6611  return SDValue();
6612}
6613
6614
6615SDValue
6616X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
6617                                           SelectionDAG &DAG) const {
6618  if (!isa<ConstantSDNode>(Op.getOperand(1)))
6619    return SDValue();
6620
6621  SDValue Vec = Op.getOperand(0);
6622  EVT VecVT = Vec.getValueType();
6623
6624  // If this is a 256-bit vector result, first extract the 128-bit vector and
6625  // then extract the element from the 128-bit vector.
6626  if (VecVT.getSizeInBits() == 256) {
6627    DebugLoc dl = Op.getNode()->getDebugLoc();
6628    unsigned NumElems = VecVT.getVectorNumElements();
6629    SDValue Idx = Op.getOperand(1);
6630    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
6631
6632    // Get the 128-bit vector.
6633    bool Upper = IdxVal >= NumElems/2;
6634    Vec = Extract128BitVector(Vec,
6635                    DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32), DAG, dl);
6636
6637    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
6638                    Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx);
6639  }
6640
6641  assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length");
6642
6643  if (Subtarget->hasSSE41() || Subtarget->hasAVX()) {
6644    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
6645    if (Res.getNode())
6646      return Res;
6647  }
6648
6649  EVT VT = Op.getValueType();
6650  DebugLoc dl = Op.getDebugLoc();
6651  // TODO: handle v16i8.
6652  if (VT.getSizeInBits() == 16) {
6653    SDValue Vec = Op.getOperand(0);
6654    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6655    if (Idx == 0)
6656      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
6657                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
6658                                     DAG.getNode(ISD::BITCAST, dl,
6659                                                 MVT::v4i32, Vec),
6660                                     Op.getOperand(1)));
6661    // Transform it so it match pextrw which produces a 32-bit result.
6662    EVT EltVT = MVT::i32;
6663    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
6664                                    Op.getOperand(0), Op.getOperand(1));
6665    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
6666                                    DAG.getValueType(VT));
6667    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
6668  } else if (VT.getSizeInBits() == 32) {
6669    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6670    if (Idx == 0)
6671      return Op;
6672
6673    // SHUFPS the element to the lowest double word, then movss.
6674    int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
6675    EVT VVT = Op.getOperand(0).getValueType();
6676    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
6677                                       DAG.getUNDEF(VVT), Mask);
6678    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
6679                       DAG.getIntPtrConstant(0));
6680  } else if (VT.getSizeInBits() == 64) {
6681    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
6682    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
6683    //        to match extract_elt for f64.
6684    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6685    if (Idx == 0)
6686      return Op;
6687
6688    // UNPCKHPD the element to the lowest double word, then movsd.
6689    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
6690    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
6691    int Mask[2] = { 1, -1 };
6692    EVT VVT = Op.getOperand(0).getValueType();
6693    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
6694                                       DAG.getUNDEF(VVT), Mask);
6695    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
6696                       DAG.getIntPtrConstant(0));
6697  }
6698
6699  return SDValue();
6700}
6701
6702SDValue
6703X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
6704                                               SelectionDAG &DAG) const {
6705  EVT VT = Op.getValueType();
6706  EVT EltVT = VT.getVectorElementType();
6707  DebugLoc dl = Op.getDebugLoc();
6708
6709  SDValue N0 = Op.getOperand(0);
6710  SDValue N1 = Op.getOperand(1);
6711  SDValue N2 = Op.getOperand(2);
6712
6713  if (VT.getSizeInBits() == 256)
6714    return SDValue();
6715
6716  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
6717      isa<ConstantSDNode>(N2)) {
6718    unsigned Opc;
6719    if (VT == MVT::v8i16)
6720      Opc = X86ISD::PINSRW;
6721    else if (VT == MVT::v16i8)
6722      Opc = X86ISD::PINSRB;
6723    else
6724      Opc = X86ISD::PINSRB;
6725
6726    // Transform it so it match pinsr{b,w} which expects a GR32 as its second
6727    // argument.
6728    if (N1.getValueType() != MVT::i32)
6729      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
6730    if (N2.getValueType() != MVT::i32)
6731      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
6732    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
6733  } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
6734    // Bits [7:6] of the constant are the source select.  This will always be
6735    //  zero here.  The DAG Combiner may combine an extract_elt index into these
6736    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
6737    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
6738    // Bits [5:4] of the constant are the destination select.  This is the
6739    //  value of the incoming immediate.
6740    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
6741    //   combine either bitwise AND or insert of float 0.0 to set these bits.
6742    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
6743    // Create this as a scalar to vector..
6744    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
6745    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
6746  } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
6747    // PINSR* works with constant index.
6748    return Op;
6749  }
6750  return SDValue();
6751}
6752
6753SDValue
6754X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
6755  EVT VT = Op.getValueType();
6756  EVT EltVT = VT.getVectorElementType();
6757
6758  DebugLoc dl = Op.getDebugLoc();
6759  SDValue N0 = Op.getOperand(0);
6760  SDValue N1 = Op.getOperand(1);
6761  SDValue N2 = Op.getOperand(2);
6762
6763  // If this is a 256-bit vector result, first extract the 128-bit vector,
6764  // insert the element into the extracted half and then place it back.
6765  if (VT.getSizeInBits() == 256) {
6766    if (!isa<ConstantSDNode>(N2))
6767      return SDValue();
6768
6769    // Get the desired 128-bit vector half.
6770    unsigned NumElems = VT.getVectorNumElements();
6771    unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
6772    bool Upper = IdxVal >= NumElems/2;
6773    SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32);
6774    SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl);
6775
6776    // Insert the element into the desired half.
6777    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V,
6778                 N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2);
6779
6780    // Insert the changed part back to the 256-bit vector
6781    return Insert128BitVector(N0, V, Ins128Idx, DAG, dl);
6782  }
6783
6784  if (Subtarget->hasSSE41() || Subtarget->hasAVX())
6785    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
6786
6787  if (EltVT == MVT::i8)
6788    return SDValue();
6789
6790  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
6791    // Transform it so it match pinsrw which expects a 16-bit value in a GR32
6792    // as its second argument.
6793    if (N1.getValueType() != MVT::i32)
6794      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
6795    if (N2.getValueType() != MVT::i32)
6796      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
6797    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
6798  }
6799  return SDValue();
6800}
6801
6802SDValue
6803X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6804  LLVMContext *Context = DAG.getContext();
6805  DebugLoc dl = Op.getDebugLoc();
6806  EVT OpVT = Op.getValueType();
6807
6808  // If this is a 256-bit vector result, first insert into a 128-bit
6809  // vector and then insert into the 256-bit vector.
6810  if (OpVT.getSizeInBits() > 128) {
6811    // Insert into a 128-bit vector.
6812    EVT VT128 = EVT::getVectorVT(*Context,
6813                                 OpVT.getVectorElementType(),
6814                                 OpVT.getVectorNumElements() / 2);
6815
6816    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
6817
6818    // Insert the 128-bit vector.
6819    return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op,
6820                              DAG.getConstant(0, MVT::i32),
6821                              DAG, dl);
6822  }
6823
6824  if (Op.getValueType() == MVT::v1i64 &&
6825      Op.getOperand(0).getValueType() == MVT::i64)
6826    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
6827
6828  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
6829  assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 &&
6830         "Expected an SSE type!");
6831  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(),
6832                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
6833}
6834
6835// Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
6836// a simple subregister reference or explicit instructions to grab
6837// upper bits of a vector.
6838SDValue
6839X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
6840  if (Subtarget->hasAVX()) {
6841    DebugLoc dl = Op.getNode()->getDebugLoc();
6842    SDValue Vec = Op.getNode()->getOperand(0);
6843    SDValue Idx = Op.getNode()->getOperand(1);
6844
6845    if (Op.getNode()->getValueType(0).getSizeInBits() == 128
6846        && Vec.getNode()->getValueType(0).getSizeInBits() == 256) {
6847        return Extract128BitVector(Vec, Idx, DAG, dl);
6848    }
6849  }
6850  return SDValue();
6851}
6852
6853// Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
6854// simple superregister reference or explicit instructions to insert
6855// the upper bits of a vector.
6856SDValue
6857X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
6858  if (Subtarget->hasAVX()) {
6859    DebugLoc dl = Op.getNode()->getDebugLoc();
6860    SDValue Vec = Op.getNode()->getOperand(0);
6861    SDValue SubVec = Op.getNode()->getOperand(1);
6862    SDValue Idx = Op.getNode()->getOperand(2);
6863
6864    if (Op.getNode()->getValueType(0).getSizeInBits() == 256
6865        && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) {
6866      return Insert128BitVector(Vec, SubVec, Idx, DAG, dl);
6867    }
6868  }
6869  return SDValue();
6870}
6871
6872// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
6873// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
6874// one of the above mentioned nodes. It has to be wrapped because otherwise
6875// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
6876// be used to form addressing mode. These wrapped nodes will be selected
6877// into MOV32ri.
6878SDValue
6879X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
6880  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
6881
6882  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
6883  // global base reg.
6884  unsigned char OpFlag = 0;
6885  unsigned WrapperKind = X86ISD::Wrapper;
6886  CodeModel::Model M = getTargetMachine().getCodeModel();
6887
6888  if (Subtarget->isPICStyleRIPRel() &&
6889      (M == CodeModel::Small || M == CodeModel::Kernel))
6890    WrapperKind = X86ISD::WrapperRIP;
6891  else if (Subtarget->isPICStyleGOT())
6892    OpFlag = X86II::MO_GOTOFF;
6893  else if (Subtarget->isPICStyleStubPIC())
6894    OpFlag = X86II::MO_PIC_BASE_OFFSET;
6895
6896  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
6897                                             CP->getAlignment(),
6898                                             CP->getOffset(), OpFlag);
6899  DebugLoc DL = CP->getDebugLoc();
6900  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
6901  // With PIC, the address is actually $g + Offset.
6902  if (OpFlag) {
6903    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
6904                         DAG.getNode(X86ISD::GlobalBaseReg,
6905                                     DebugLoc(), getPointerTy()),
6906                         Result);
6907  }
6908
6909  return Result;
6910}
6911
6912SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
6913  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
6914
6915  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
6916  // global base reg.
6917  unsigned char OpFlag = 0;
6918  unsigned WrapperKind = X86ISD::Wrapper;
6919  CodeModel::Model M = getTargetMachine().getCodeModel();
6920
6921  if (Subtarget->isPICStyleRIPRel() &&
6922      (M == CodeModel::Small || M == CodeModel::Kernel))
6923    WrapperKind = X86ISD::WrapperRIP;
6924  else if (Subtarget->isPICStyleGOT())
6925    OpFlag = X86II::MO_GOTOFF;
6926  else if (Subtarget->isPICStyleStubPIC())
6927    OpFlag = X86II::MO_PIC_BASE_OFFSET;
6928
6929  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
6930                                          OpFlag);
6931  DebugLoc DL = JT->getDebugLoc();
6932  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
6933
6934  // With PIC, the address is actually $g + Offset.
6935  if (OpFlag)
6936    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
6937                         DAG.getNode(X86ISD::GlobalBaseReg,
6938                                     DebugLoc(), getPointerTy()),
6939                         Result);
6940
6941  return Result;
6942}
6943
6944SDValue
6945X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
6946  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
6947
6948  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
6949  // global base reg.
6950  unsigned char OpFlag = 0;
6951  unsigned WrapperKind = X86ISD::Wrapper;
6952  CodeModel::Model M = getTargetMachine().getCodeModel();
6953
6954  if (Subtarget->isPICStyleRIPRel() &&
6955      (M == CodeModel::Small || M == CodeModel::Kernel)) {
6956    if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
6957      OpFlag = X86II::MO_GOTPCREL;
6958    WrapperKind = X86ISD::WrapperRIP;
6959  } else if (Subtarget->isPICStyleGOT()) {
6960    OpFlag = X86II::MO_GOT;
6961  } else if (Subtarget->isPICStyleStubPIC()) {
6962    OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
6963  } else if (Subtarget->isPICStyleStubNoDynamic()) {
6964    OpFlag = X86II::MO_DARWIN_NONLAZY;
6965  }
6966
6967  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
6968
6969  DebugLoc DL = Op.getDebugLoc();
6970  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
6971
6972
6973  // With PIC, the address is actually $g + Offset.
6974  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
6975      !Subtarget->is64Bit()) {
6976    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
6977                         DAG.getNode(X86ISD::GlobalBaseReg,
6978                                     DebugLoc(), getPointerTy()),
6979                         Result);
6980  }
6981
6982  // For symbols that require a load from a stub to get the address, emit the
6983  // load.
6984  if (isGlobalStubReference(OpFlag))
6985    Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
6986                         MachinePointerInfo::getGOT(), false, false, 0);
6987
6988  return Result;
6989}
6990
6991SDValue
6992X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
6993  // Create the TargetBlockAddressAddress node.
6994  unsigned char OpFlags =
6995    Subtarget->ClassifyBlockAddressReference();
6996  CodeModel::Model M = getTargetMachine().getCodeModel();
6997  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
6998  DebugLoc dl = Op.getDebugLoc();
6999  SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
7000                                       /*isTarget=*/true, OpFlags);
7001
7002  if (Subtarget->isPICStyleRIPRel() &&
7003      (M == CodeModel::Small || M == CodeModel::Kernel))
7004    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
7005  else
7006    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
7007
7008  // With PIC, the address is actually $g + Offset.
7009  if (isGlobalRelativeToPICBase(OpFlags)) {
7010    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
7011                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
7012                         Result);
7013  }
7014
7015  return Result;
7016}
7017
7018SDValue
7019X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
7020                                      int64_t Offset,
7021                                      SelectionDAG &DAG) const {
7022  // Create the TargetGlobalAddress node, folding in the constant
7023  // offset if it is legal.
7024  unsigned char OpFlags =
7025    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
7026  CodeModel::Model M = getTargetMachine().getCodeModel();
7027  SDValue Result;
7028  if (OpFlags == X86II::MO_NO_FLAG &&
7029      X86::isOffsetSuitableForCodeModel(Offset, M)) {
7030    // A direct static reference to a global.
7031    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
7032    Offset = 0;
7033  } else {
7034    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
7035  }
7036
7037  if (Subtarget->isPICStyleRIPRel() &&
7038      (M == CodeModel::Small || M == CodeModel::Kernel))
7039    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
7040  else
7041    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
7042
7043  // With PIC, the address is actually $g + Offset.
7044  if (isGlobalRelativeToPICBase(OpFlags)) {
7045    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
7046                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
7047                         Result);
7048  }
7049
7050  // For globals that require a load from a stub to get the address, emit the
7051  // load.
7052  if (isGlobalStubReference(OpFlags))
7053    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
7054                         MachinePointerInfo::getGOT(), false, false, 0);
7055
7056  // If there was a non-zero offset that we didn't fold, create an explicit
7057  // addition for it.
7058  if (Offset != 0)
7059    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
7060                         DAG.getConstant(Offset, getPointerTy()));
7061
7062  return Result;
7063}
7064
7065SDValue
7066X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
7067  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
7068  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
7069  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
7070}
7071
7072static SDValue
7073GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
7074           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
7075           unsigned char OperandFlags) {
7076  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7077  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
7078  DebugLoc dl = GA->getDebugLoc();
7079  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7080                                           GA->getValueType(0),
7081                                           GA->getOffset(),
7082                                           OperandFlags);
7083  if (InFlag) {
7084    SDValue Ops[] = { Chain,  TGA, *InFlag };
7085    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
7086  } else {
7087    SDValue Ops[]  = { Chain, TGA };
7088    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
7089  }
7090
7091  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
7092  MFI->setAdjustsStack(true);
7093
7094  SDValue Flag = Chain.getValue(1);
7095  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
7096}
7097
7098// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
7099static SDValue
7100LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
7101                                const EVT PtrVT) {
7102  SDValue InFlag;
7103  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
7104  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
7105                                     DAG.getNode(X86ISD::GlobalBaseReg,
7106                                                 DebugLoc(), PtrVT), InFlag);
7107  InFlag = Chain.getValue(1);
7108
7109  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
7110}
7111
7112// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
7113static SDValue
7114LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
7115                                const EVT PtrVT) {
7116  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
7117                    X86::RAX, X86II::MO_TLSGD);
7118}
7119
7120// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
7121// "local exec" model.
7122static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
7123                                   const EVT PtrVT, TLSModel::Model model,
7124                                   bool is64Bit) {
7125  DebugLoc dl = GA->getDebugLoc();
7126
7127  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
7128  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
7129                                                         is64Bit ? 257 : 256));
7130
7131  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
7132                                      DAG.getIntPtrConstant(0),
7133                                      MachinePointerInfo(Ptr), false, false, 0);
7134
7135  unsigned char OperandFlags = 0;
7136  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
7137  // initialexec.
7138  unsigned WrapperKind = X86ISD::Wrapper;
7139  if (model == TLSModel::LocalExec) {
7140    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
7141  } else if (is64Bit) {
7142    assert(model == TLSModel::InitialExec);
7143    OperandFlags = X86II::MO_GOTTPOFF;
7144    WrapperKind = X86ISD::WrapperRIP;
7145  } else {
7146    assert(model == TLSModel::InitialExec);
7147    OperandFlags = X86II::MO_INDNTPOFF;
7148  }
7149
7150  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
7151  // exec)
7152  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7153                                           GA->getValueType(0),
7154                                           GA->getOffset(), OperandFlags);
7155  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
7156
7157  if (model == TLSModel::InitialExec)
7158    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
7159                         MachinePointerInfo::getGOT(), false, false, 0);
7160
7161  // The address of the thread local variable is the add of the thread
7162  // pointer with the offset of the variable.
7163  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
7164}
7165
7166SDValue
7167X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
7168
7169  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
7170  const GlobalValue *GV = GA->getGlobal();
7171
7172  if (Subtarget->isTargetELF()) {
7173    // TODO: implement the "local dynamic" model
7174    // TODO: implement the "initial exec"model for pic executables
7175
7176    // If GV is an alias then use the aliasee for determining
7177    // thread-localness.
7178    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
7179      GV = GA->resolveAliasedGlobal(false);
7180
7181    TLSModel::Model model
7182      = getTLSModel(GV, getTargetMachine().getRelocationModel());
7183
7184    switch (model) {
7185      case TLSModel::GeneralDynamic:
7186      case TLSModel::LocalDynamic: // not implemented
7187        if (Subtarget->is64Bit())
7188          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
7189        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
7190
7191      case TLSModel::InitialExec:
7192      case TLSModel::LocalExec:
7193        return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
7194                                   Subtarget->is64Bit());
7195    }
7196  } else if (Subtarget->isTargetDarwin()) {
7197    // Darwin only has one model of TLS.  Lower to that.
7198    unsigned char OpFlag = 0;
7199    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
7200                           X86ISD::WrapperRIP : X86ISD::Wrapper;
7201
7202    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7203    // global base reg.
7204    bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
7205                  !Subtarget->is64Bit();
7206    if (PIC32)
7207      OpFlag = X86II::MO_TLVP_PIC_BASE;
7208    else
7209      OpFlag = X86II::MO_TLVP;
7210    DebugLoc DL = Op.getDebugLoc();
7211    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
7212                                                GA->getValueType(0),
7213                                                GA->getOffset(), OpFlag);
7214    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7215
7216    // With PIC32, the address is actually $g + Offset.
7217    if (PIC32)
7218      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7219                           DAG.getNode(X86ISD::GlobalBaseReg,
7220                                       DebugLoc(), getPointerTy()),
7221                           Offset);
7222
7223    // Lowering the machine isd will make sure everything is in the right
7224    // location.
7225    SDValue Chain = DAG.getEntryNode();
7226    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
7227    SDValue Args[] = { Chain, Offset };
7228    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
7229
7230    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
7231    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7232    MFI->setAdjustsStack(true);
7233
7234    // And our return value (tls address) is in the standard call return value
7235    // location.
7236    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
7237    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
7238  }
7239
7240  assert(false &&
7241         "TLS not implemented for this target.");
7242
7243  llvm_unreachable("Unreachable");
7244  return SDValue();
7245}
7246
7247
7248/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and
7249/// take a 2 x i32 value to shift plus a shift amount.
7250SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const {
7251  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
7252  EVT VT = Op.getValueType();
7253  unsigned VTBits = VT.getSizeInBits();
7254  DebugLoc dl = Op.getDebugLoc();
7255  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
7256  SDValue ShOpLo = Op.getOperand(0);
7257  SDValue ShOpHi = Op.getOperand(1);
7258  SDValue ShAmt  = Op.getOperand(2);
7259  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
7260                                     DAG.getConstant(VTBits - 1, MVT::i8))
7261                       : DAG.getConstant(0, VT);
7262
7263  SDValue Tmp2, Tmp3;
7264  if (Op.getOpcode() == ISD::SHL_PARTS) {
7265    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
7266    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
7267  } else {
7268    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
7269    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
7270  }
7271
7272  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
7273                                DAG.getConstant(VTBits, MVT::i8));
7274  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
7275                             AndNode, DAG.getConstant(0, MVT::i8));
7276
7277  SDValue Hi, Lo;
7278  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
7279  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
7280  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
7281
7282  if (Op.getOpcode() == ISD::SHL_PARTS) {
7283    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
7284    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
7285  } else {
7286    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
7287    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
7288  }
7289
7290  SDValue Ops[2] = { Lo, Hi };
7291  return DAG.getMergeValues(Ops, 2, dl);
7292}
7293
7294SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
7295                                           SelectionDAG &DAG) const {
7296  EVT SrcVT = Op.getOperand(0).getValueType();
7297
7298  if (SrcVT.isVector())
7299    return SDValue();
7300
7301  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
7302         "Unknown SINT_TO_FP to lower!");
7303
7304  // These are really Legal; return the operand so the caller accepts it as
7305  // Legal.
7306  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
7307    return Op;
7308  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
7309      Subtarget->is64Bit()) {
7310    return Op;
7311  }
7312
7313  DebugLoc dl = Op.getDebugLoc();
7314  unsigned Size = SrcVT.getSizeInBits()/8;
7315  MachineFunction &MF = DAG.getMachineFunction();
7316  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
7317  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7318  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
7319                               StackSlot,
7320                               MachinePointerInfo::getFixedStack(SSFI),
7321                               false, false, 0);
7322  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
7323}
7324
7325SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
7326                                     SDValue StackSlot,
7327                                     SelectionDAG &DAG) const {
7328  // Build the FILD
7329  DebugLoc DL = Op.getDebugLoc();
7330  SDVTList Tys;
7331  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
7332  if (useSSE)
7333    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
7334  else
7335    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
7336
7337  unsigned ByteSize = SrcVT.getSizeInBits()/8;
7338
7339  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
7340  MachineMemOperand *MMO;
7341  if (FI) {
7342    int SSFI = FI->getIndex();
7343    MMO =
7344      DAG.getMachineFunction()
7345      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
7346                            MachineMemOperand::MOLoad, ByteSize, ByteSize);
7347  } else {
7348    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
7349    StackSlot = StackSlot.getOperand(1);
7350  }
7351  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
7352  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
7353                                           X86ISD::FILD, DL,
7354                                           Tys, Ops, array_lengthof(Ops),
7355                                           SrcVT, MMO);
7356
7357  if (useSSE) {
7358    Chain = Result.getValue(1);
7359    SDValue InFlag = Result.getValue(2);
7360
7361    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
7362    // shouldn't be necessary except that RFP cannot be live across
7363    // multiple blocks. When stackifier is fixed, they can be uncoupled.
7364    MachineFunction &MF = DAG.getMachineFunction();
7365    unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
7366    int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
7367    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7368    Tys = DAG.getVTList(MVT::Other);
7369    SDValue Ops[] = {
7370      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
7371    };
7372    MachineMemOperand *MMO =
7373      DAG.getMachineFunction()
7374      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
7375                            MachineMemOperand::MOStore, SSFISize, SSFISize);
7376
7377    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
7378                                    Ops, array_lengthof(Ops),
7379                                    Op.getValueType(), MMO);
7380    Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
7381                         MachinePointerInfo::getFixedStack(SSFI),
7382                         false, false, 0);
7383  }
7384
7385  return Result;
7386}
7387
7388// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
7389SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
7390                                               SelectionDAG &DAG) const {
7391  // This algorithm is not obvious. Here it is in C code, more or less:
7392  /*
7393    double uint64_to_double( uint32_t hi, uint32_t lo ) {
7394      static const __m128i exp = { 0x4330000045300000ULL, 0 };
7395      static const __m128d bias = { 0x1.0p84, 0x1.0p52 };
7396
7397      // Copy ints to xmm registers.
7398      __m128i xh = _mm_cvtsi32_si128( hi );
7399      __m128i xl = _mm_cvtsi32_si128( lo );
7400
7401      // Combine into low half of a single xmm register.
7402      __m128i x = _mm_unpacklo_epi32( xh, xl );
7403      __m128d d;
7404      double sd;
7405
7406      // Merge in appropriate exponents to give the integer bits the right
7407      // magnitude.
7408      x = _mm_unpacklo_epi32( x, exp );
7409
7410      // Subtract away the biases to deal with the IEEE-754 double precision
7411      // implicit 1.
7412      d = _mm_sub_pd( (__m128d) x, bias );
7413
7414      // All conversions up to here are exact. The correctly rounded result is
7415      // calculated using the current rounding mode using the following
7416      // horizontal add.
7417      d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
7418      _mm_store_sd( &sd, d );   // Because we are returning doubles in XMM, this
7419                                // store doesn't really need to be here (except
7420                                // maybe to zero the other double)
7421      return sd;
7422    }
7423  */
7424
7425  DebugLoc dl = Op.getDebugLoc();
7426  LLVMContext *Context = DAG.getContext();
7427
7428  // Build some magic constants.
7429  std::vector<Constant*> CV0;
7430  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
7431  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
7432  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
7433  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
7434  Constant *C0 = ConstantVector::get(CV0);
7435  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
7436
7437  std::vector<Constant*> CV1;
7438  CV1.push_back(
7439    ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
7440  CV1.push_back(
7441    ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
7442  Constant *C1 = ConstantVector::get(CV1);
7443  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
7444
7445  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
7446                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
7447                                        Op.getOperand(0),
7448                                        DAG.getIntPtrConstant(1)));
7449  SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
7450                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
7451                                        Op.getOperand(0),
7452                                        DAG.getIntPtrConstant(0)));
7453  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
7454  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
7455                              MachinePointerInfo::getConstantPool(),
7456                              false, false, 16);
7457  SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
7458  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2);
7459  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
7460                              MachinePointerInfo::getConstantPool(),
7461                              false, false, 16);
7462  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
7463
7464  // Add the halves; easiest way is to swap them into another reg first.
7465  int ShufMask[2] = { 1, -1 };
7466  SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
7467                                      DAG.getUNDEF(MVT::v2f64), ShufMask);
7468  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
7469  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
7470                     DAG.getIntPtrConstant(0));
7471}
7472
7473// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
7474SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
7475                                               SelectionDAG &DAG) const {
7476  DebugLoc dl = Op.getDebugLoc();
7477  // FP constant to bias correct the final result.
7478  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
7479                                   MVT::f64);
7480
7481  // Load the 32-bit value into an XMM register.
7482  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
7483                             Op.getOperand(0));
7484
7485  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
7486                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
7487                     DAG.getIntPtrConstant(0));
7488
7489  // Or the load with the bias.
7490  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
7491                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
7492                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7493                                                   MVT::v2f64, Load)),
7494                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
7495                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7496                                                   MVT::v2f64, Bias)));
7497  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
7498                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
7499                   DAG.getIntPtrConstant(0));
7500
7501  // Subtract the bias.
7502  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
7503
7504  // Handle final rounding.
7505  EVT DestVT = Op.getValueType();
7506
7507  if (DestVT.bitsLT(MVT::f64)) {
7508    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
7509                       DAG.getIntPtrConstant(0));
7510  } else if (DestVT.bitsGT(MVT::f64)) {
7511    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
7512  }
7513
7514  // Handle final rounding.
7515  return Sub;
7516}
7517
7518SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
7519                                           SelectionDAG &DAG) const {
7520  SDValue N0 = Op.getOperand(0);
7521  DebugLoc dl = Op.getDebugLoc();
7522
7523  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
7524  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
7525  // the optimization here.
7526  if (DAG.SignBitIsZero(N0))
7527    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
7528
7529  EVT SrcVT = N0.getValueType();
7530  EVT DstVT = Op.getValueType();
7531  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
7532    return LowerUINT_TO_FP_i64(Op, DAG);
7533  else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
7534    return LowerUINT_TO_FP_i32(Op, DAG);
7535
7536  // Make a 64-bit buffer, and use it to build an FILD.
7537  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
7538  if (SrcVT == MVT::i32) {
7539    SDValue WordOff = DAG.getConstant(4, getPointerTy());
7540    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
7541                                     getPointerTy(), StackSlot, WordOff);
7542    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
7543                                  StackSlot, MachinePointerInfo(),
7544                                  false, false, 0);
7545    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
7546                                  OffsetSlot, MachinePointerInfo(),
7547                                  false, false, 0);
7548    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
7549    return Fild;
7550  }
7551
7552  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
7553  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
7554                                StackSlot, MachinePointerInfo(),
7555                               false, false, 0);
7556  // For i64 source, we need to add the appropriate power of 2 if the input
7557  // was negative.  This is the same as the optimization in
7558  // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
7559  // we must be careful to do the computation in x87 extended precision, not
7560  // in SSE. (The generic code can't know it's OK to do this, or how to.)
7561  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
7562  MachineMemOperand *MMO =
7563    DAG.getMachineFunction()
7564    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
7565                          MachineMemOperand::MOLoad, 8, 8);
7566
7567  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
7568  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
7569  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3,
7570                                         MVT::i64, MMO);
7571
7572  APInt FF(32, 0x5F800000ULL);
7573
7574  // Check whether the sign bit is set.
7575  SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
7576                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
7577                                 ISD::SETLT);
7578
7579  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
7580  SDValue FudgePtr = DAG.getConstantPool(
7581                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
7582                                         getPointerTy());
7583
7584  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
7585  SDValue Zero = DAG.getIntPtrConstant(0);
7586  SDValue Four = DAG.getIntPtrConstant(4);
7587  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
7588                               Zero, Four);
7589  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
7590
7591  // Load the value out, extending it from f32 to f80.
7592  // FIXME: Avoid the extend by constructing the right constant pool?
7593  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
7594                                 FudgePtr, MachinePointerInfo::getConstantPool(),
7595                                 MVT::f32, false, false, 4);
7596  // Extend everything to 80 bits to force it to be done on x87.
7597  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
7598  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
7599}
7600
7601std::pair<SDValue,SDValue> X86TargetLowering::
7602FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const {
7603  DebugLoc DL = Op.getDebugLoc();
7604
7605  EVT DstTy = Op.getValueType();
7606
7607  if (!IsSigned) {
7608    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
7609    DstTy = MVT::i64;
7610  }
7611
7612  assert(DstTy.getSimpleVT() <= MVT::i64 &&
7613         DstTy.getSimpleVT() >= MVT::i16 &&
7614         "Unknown FP_TO_SINT to lower!");
7615
7616  // These are really Legal.
7617  if (DstTy == MVT::i32 &&
7618      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
7619    return std::make_pair(SDValue(), SDValue());
7620  if (Subtarget->is64Bit() &&
7621      DstTy == MVT::i64 &&
7622      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
7623    return std::make_pair(SDValue(), SDValue());
7624
7625  // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
7626  // stack slot.
7627  MachineFunction &MF = DAG.getMachineFunction();
7628  unsigned MemSize = DstTy.getSizeInBits()/8;
7629  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
7630  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7631
7632
7633
7634  unsigned Opc;
7635  switch (DstTy.getSimpleVT().SimpleTy) {
7636  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
7637  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
7638  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
7639  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
7640  }
7641
7642  SDValue Chain = DAG.getEntryNode();
7643  SDValue Value = Op.getOperand(0);
7644  EVT TheVT = Op.getOperand(0).getValueType();
7645  if (isScalarFPTypeInSSEReg(TheVT)) {
7646    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
7647    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
7648                         MachinePointerInfo::getFixedStack(SSFI),
7649                         false, false, 0);
7650    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
7651    SDValue Ops[] = {
7652      Chain, StackSlot, DAG.getValueType(TheVT)
7653    };
7654
7655    MachineMemOperand *MMO =
7656      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
7657                              MachineMemOperand::MOLoad, MemSize, MemSize);
7658    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3,
7659                                    DstTy, MMO);
7660    Chain = Value.getValue(1);
7661    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
7662    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7663  }
7664
7665  MachineMemOperand *MMO =
7666    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
7667                            MachineMemOperand::MOStore, MemSize, MemSize);
7668
7669  // Build the FP_TO_INT*_IN_MEM
7670  SDValue Ops[] = { Chain, Value, StackSlot };
7671  SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
7672                                         Ops, 3, DstTy, MMO);
7673
7674  return std::make_pair(FIST, StackSlot);
7675}
7676
7677SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
7678                                           SelectionDAG &DAG) const {
7679  if (Op.getValueType().isVector())
7680    return SDValue();
7681
7682  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
7683  SDValue FIST = Vals.first, StackSlot = Vals.second;
7684  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
7685  if (FIST.getNode() == 0) return Op;
7686
7687  // Load the result.
7688  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
7689                     FIST, StackSlot, MachinePointerInfo(), false, false, 0);
7690}
7691
7692SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
7693                                           SelectionDAG &DAG) const {
7694  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false);
7695  SDValue FIST = Vals.first, StackSlot = Vals.second;
7696  assert(FIST.getNode() && "Unexpected failure");
7697
7698  // Load the result.
7699  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
7700                     FIST, StackSlot, MachinePointerInfo(), false, false, 0);
7701}
7702
7703SDValue X86TargetLowering::LowerFABS(SDValue Op,
7704                                     SelectionDAG &DAG) const {
7705  LLVMContext *Context = DAG.getContext();
7706  DebugLoc dl = Op.getDebugLoc();
7707  EVT VT = Op.getValueType();
7708  EVT EltVT = VT;
7709  if (VT.isVector())
7710    EltVT = VT.getVectorElementType();
7711  std::vector<Constant*> CV;
7712  if (EltVT == MVT::f64) {
7713    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
7714    CV.push_back(C);
7715    CV.push_back(C);
7716  } else {
7717    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
7718    CV.push_back(C);
7719    CV.push_back(C);
7720    CV.push_back(C);
7721    CV.push_back(C);
7722  }
7723  Constant *C = ConstantVector::get(CV);
7724  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
7725  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
7726                             MachinePointerInfo::getConstantPool(),
7727                             false, false, 16);
7728  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
7729}
7730
7731SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
7732  LLVMContext *Context = DAG.getContext();
7733  DebugLoc dl = Op.getDebugLoc();
7734  EVT VT = Op.getValueType();
7735  EVT EltVT = VT;
7736  if (VT.isVector())
7737    EltVT = VT.getVectorElementType();
7738  std::vector<Constant*> CV;
7739  if (EltVT == MVT::f64) {
7740    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
7741    CV.push_back(C);
7742    CV.push_back(C);
7743  } else {
7744    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
7745    CV.push_back(C);
7746    CV.push_back(C);
7747    CV.push_back(C);
7748    CV.push_back(C);
7749  }
7750  Constant *C = ConstantVector::get(CV);
7751  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
7752  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
7753                             MachinePointerInfo::getConstantPool(),
7754                             false, false, 16);
7755  if (VT.isVector()) {
7756    return DAG.getNode(ISD::BITCAST, dl, VT,
7757                       DAG.getNode(ISD::XOR, dl, MVT::v2i64,
7758                    DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
7759                                Op.getOperand(0)),
7760                    DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask)));
7761  } else {
7762    return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
7763  }
7764}
7765
7766SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7767  LLVMContext *Context = DAG.getContext();
7768  SDValue Op0 = Op.getOperand(0);
7769  SDValue Op1 = Op.getOperand(1);
7770  DebugLoc dl = Op.getDebugLoc();
7771  EVT VT = Op.getValueType();
7772  EVT SrcVT = Op1.getValueType();
7773
7774  // If second operand is smaller, extend it first.
7775  if (SrcVT.bitsLT(VT)) {
7776    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
7777    SrcVT = VT;
7778  }
7779  // And if it is bigger, shrink it first.
7780  if (SrcVT.bitsGT(VT)) {
7781    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
7782    SrcVT = VT;
7783  }
7784
7785  // At this point the operands and the result should have the same
7786  // type, and that won't be f80 since that is not custom lowered.
7787
7788  // First get the sign bit of second operand.
7789  std::vector<Constant*> CV;
7790  if (SrcVT == MVT::f64) {
7791    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
7792    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
7793  } else {
7794    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
7795    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
7796    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
7797    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
7798  }
7799  Constant *C = ConstantVector::get(CV);
7800  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
7801  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
7802                              MachinePointerInfo::getConstantPool(),
7803                              false, false, 16);
7804  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
7805
7806  // Shift sign bit right or left if the two operands have different types.
7807  if (SrcVT.bitsGT(VT)) {
7808    // Op0 is MVT::f32, Op1 is MVT::f64.
7809    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
7810    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
7811                          DAG.getConstant(32, MVT::i32));
7812    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
7813    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
7814                          DAG.getIntPtrConstant(0));
7815  }
7816
7817  // Clear first operand sign bit.
7818  CV.clear();
7819  if (VT == MVT::f64) {
7820    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
7821    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
7822  } else {
7823    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
7824    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
7825    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
7826    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
7827  }
7828  C = ConstantVector::get(CV);
7829  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
7830  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
7831                              MachinePointerInfo::getConstantPool(),
7832                              false, false, 16);
7833  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
7834
7835  // Or the value with the sign bit.
7836  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
7837}
7838
7839SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const {
7840  SDValue N0 = Op.getOperand(0);
7841  DebugLoc dl = Op.getDebugLoc();
7842  EVT VT = Op.getValueType();
7843
7844  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
7845  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
7846                                  DAG.getConstant(1, VT));
7847  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
7848}
7849
7850/// Emit nodes that will be selected as "test Op0,Op0", or something
7851/// equivalent.
7852SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
7853                                    SelectionDAG &DAG) const {
7854  DebugLoc dl = Op.getDebugLoc();
7855
7856  // CF and OF aren't always set the way we want. Determine which
7857  // of these we need.
7858  bool NeedCF = false;
7859  bool NeedOF = false;
7860  switch (X86CC) {
7861  default: break;
7862  case X86::COND_A: case X86::COND_AE:
7863  case X86::COND_B: case X86::COND_BE:
7864    NeedCF = true;
7865    break;
7866  case X86::COND_G: case X86::COND_GE:
7867  case X86::COND_L: case X86::COND_LE:
7868  case X86::COND_O: case X86::COND_NO:
7869    NeedOF = true;
7870    break;
7871  }
7872
7873  // See if we can use the EFLAGS value from the operand instead of
7874  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
7875  // we prove that the arithmetic won't overflow, we can't use OF or CF.
7876  if (Op.getResNo() != 0 || NeedOF || NeedCF)
7877    // Emit a CMP with 0, which is the TEST pattern.
7878    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
7879                       DAG.getConstant(0, Op.getValueType()));
7880
7881  unsigned Opcode = 0;
7882  unsigned NumOperands = 0;
7883  switch (Op.getNode()->getOpcode()) {
7884  case ISD::ADD:
7885    // Due to an isel shortcoming, be conservative if this add is likely to be
7886    // selected as part of a load-modify-store instruction. When the root node
7887    // in a match is a store, isel doesn't know how to remap non-chain non-flag
7888    // uses of other nodes in the match, such as the ADD in this case. This
7889    // leads to the ADD being left around and reselected, with the result being
7890    // two adds in the output.  Alas, even if none our users are stores, that
7891    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
7892    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
7893    // climbing the DAG back to the root, and it doesn't seem to be worth the
7894    // effort.
7895    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
7896           UE = Op.getNode()->use_end(); UI != UE; ++UI)
7897      if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
7898        goto default_case;
7899
7900    if (ConstantSDNode *C =
7901        dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
7902      // An add of one will be selected as an INC.
7903      if (C->getAPIntValue() == 1) {
7904        Opcode = X86ISD::INC;
7905        NumOperands = 1;
7906        break;
7907      }
7908
7909      // An add of negative one (subtract of one) will be selected as a DEC.
7910      if (C->getAPIntValue().isAllOnesValue()) {
7911        Opcode = X86ISD::DEC;
7912        NumOperands = 1;
7913        break;
7914      }
7915    }
7916
7917    // Otherwise use a regular EFLAGS-setting add.
7918    Opcode = X86ISD::ADD;
7919    NumOperands = 2;
7920    break;
7921  case ISD::AND: {
7922    // If the primary and result isn't used, don't bother using X86ISD::AND,
7923    // because a TEST instruction will be better.
7924    bool NonFlagUse = false;
7925    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
7926           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
7927      SDNode *User = *UI;
7928      unsigned UOpNo = UI.getOperandNo();
7929      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
7930        // Look pass truncate.
7931        UOpNo = User->use_begin().getOperandNo();
7932        User = *User->use_begin();
7933      }
7934
7935      if (User->getOpcode() != ISD::BRCOND &&
7936          User->getOpcode() != ISD::SETCC &&
7937          (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
7938        NonFlagUse = true;
7939        break;
7940      }
7941    }
7942
7943    if (!NonFlagUse)
7944      break;
7945  }
7946    // FALL THROUGH
7947  case ISD::SUB:
7948  case ISD::OR:
7949  case ISD::XOR:
7950    // Due to the ISEL shortcoming noted above, be conservative if this op is
7951    // likely to be selected as part of a load-modify-store instruction.
7952    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
7953           UE = Op.getNode()->use_end(); UI != UE; ++UI)
7954      if (UI->getOpcode() == ISD::STORE)
7955        goto default_case;
7956
7957    // Otherwise use a regular EFLAGS-setting instruction.
7958    switch (Op.getNode()->getOpcode()) {
7959    default: llvm_unreachable("unexpected operator!");
7960    case ISD::SUB: Opcode = X86ISD::SUB; break;
7961    case ISD::OR:  Opcode = X86ISD::OR;  break;
7962    case ISD::XOR: Opcode = X86ISD::XOR; break;
7963    case ISD::AND: Opcode = X86ISD::AND; break;
7964    }
7965
7966    NumOperands = 2;
7967    break;
7968  case X86ISD::ADD:
7969  case X86ISD::SUB:
7970  case X86ISD::INC:
7971  case X86ISD::DEC:
7972  case X86ISD::OR:
7973  case X86ISD::XOR:
7974  case X86ISD::AND:
7975    return SDValue(Op.getNode(), 1);
7976  default:
7977  default_case:
7978    break;
7979  }
7980
7981  if (Opcode == 0)
7982    // Emit a CMP with 0, which is the TEST pattern.
7983    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
7984                       DAG.getConstant(0, Op.getValueType()));
7985
7986  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
7987  SmallVector<SDValue, 4> Ops;
7988  for (unsigned i = 0; i != NumOperands; ++i)
7989    Ops.push_back(Op.getOperand(i));
7990
7991  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
7992  DAG.ReplaceAllUsesWith(Op, New);
7993  return SDValue(New.getNode(), 1);
7994}
7995
7996/// Emit nodes that will be selected as "cmp Op0,Op1", or something
7997/// equivalent.
7998SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
7999                                   SelectionDAG &DAG) const {
8000  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
8001    if (C->getAPIntValue() == 0)
8002      return EmitTest(Op0, X86CC, DAG);
8003
8004  DebugLoc dl = Op0.getDebugLoc();
8005  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
8006}
8007
8008/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
8009/// if it's possible.
8010SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
8011                                     DebugLoc dl, SelectionDAG &DAG) const {
8012  SDValue Op0 = And.getOperand(0);
8013  SDValue Op1 = And.getOperand(1);
8014  if (Op0.getOpcode() == ISD::TRUNCATE)
8015    Op0 = Op0.getOperand(0);
8016  if (Op1.getOpcode() == ISD::TRUNCATE)
8017    Op1 = Op1.getOperand(0);
8018
8019  SDValue LHS, RHS;
8020  if (Op1.getOpcode() == ISD::SHL)
8021    std::swap(Op0, Op1);
8022  if (Op0.getOpcode() == ISD::SHL) {
8023    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
8024      if (And00C->getZExtValue() == 1) {
8025        // If we looked past a truncate, check that it's only truncating away
8026        // known zeros.
8027        unsigned BitWidth = Op0.getValueSizeInBits();
8028        unsigned AndBitWidth = And.getValueSizeInBits();
8029        if (BitWidth > AndBitWidth) {
8030          APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones;
8031          DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones);
8032          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
8033            return SDValue();
8034        }
8035        LHS = Op1;
8036        RHS = Op0.getOperand(1);
8037      }
8038  } else if (Op1.getOpcode() == ISD::Constant) {
8039    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
8040    SDValue AndLHS = Op0;
8041    if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
8042      LHS = AndLHS.getOperand(0);
8043      RHS = AndLHS.getOperand(1);
8044    }
8045  }
8046
8047  if (LHS.getNode()) {
8048    // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
8049    // instruction.  Since the shift amount is in-range-or-undefined, we know
8050    // that doing a bittest on the i32 value is ok.  We extend to i32 because
8051    // the encoding for the i16 version is larger than the i32 version.
8052    // Also promote i16 to i32 for performance / code size reason.
8053    if (LHS.getValueType() == MVT::i8 ||
8054        LHS.getValueType() == MVT::i16)
8055      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
8056
8057    // If the operand types disagree, extend the shift amount to match.  Since
8058    // BT ignores high bits (like shifts) we can use anyextend.
8059    if (LHS.getValueType() != RHS.getValueType())
8060      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
8061
8062    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
8063    unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
8064    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
8065                       DAG.getConstant(Cond, MVT::i8), BT);
8066  }
8067
8068  return SDValue();
8069}
8070
8071SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
8072  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
8073  SDValue Op0 = Op.getOperand(0);
8074  SDValue Op1 = Op.getOperand(1);
8075  DebugLoc dl = Op.getDebugLoc();
8076  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8077
8078  // Optimize to BT if possible.
8079  // Lower (X & (1 << N)) == 0 to BT(X, N).
8080  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
8081  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
8082  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
8083      Op1.getOpcode() == ISD::Constant &&
8084      cast<ConstantSDNode>(Op1)->isNullValue() &&
8085      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
8086    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
8087    if (NewSetCC.getNode())
8088      return NewSetCC;
8089  }
8090
8091  // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
8092  // these.
8093  if (Op1.getOpcode() == ISD::Constant &&
8094      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
8095       cast<ConstantSDNode>(Op1)->isNullValue()) &&
8096      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
8097
8098    // If the input is a setcc, then reuse the input setcc or use a new one with
8099    // the inverted condition.
8100    if (Op0.getOpcode() == X86ISD::SETCC) {
8101      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
8102      bool Invert = (CC == ISD::SETNE) ^
8103        cast<ConstantSDNode>(Op1)->isNullValue();
8104      if (!Invert) return Op0;
8105
8106      CCode = X86::GetOppositeBranchCondition(CCode);
8107      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
8108                         DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
8109    }
8110  }
8111
8112  bool isFP = Op1.getValueType().isFloatingPoint();
8113  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
8114  if (X86CC == X86::COND_INVALID)
8115    return SDValue();
8116
8117  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
8118  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
8119                     DAG.getConstant(X86CC, MVT::i8), EFLAGS);
8120}
8121
8122SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
8123  SDValue Cond;
8124  SDValue Op0 = Op.getOperand(0);
8125  SDValue Op1 = Op.getOperand(1);
8126  SDValue CC = Op.getOperand(2);
8127  EVT VT = Op.getValueType();
8128  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
8129  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
8130  DebugLoc dl = Op.getDebugLoc();
8131
8132  if (isFP) {
8133    unsigned SSECC = 8;
8134    EVT EltVT = Op0.getValueType().getVectorElementType();
8135    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
8136
8137    unsigned Opc = EltVT == MVT::f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
8138    bool Swap = false;
8139
8140    switch (SetCCOpcode) {
8141    default: break;
8142    case ISD::SETOEQ:
8143    case ISD::SETEQ:  SSECC = 0; break;
8144    case ISD::SETOGT:
8145    case ISD::SETGT: Swap = true; // Fallthrough
8146    case ISD::SETLT:
8147    case ISD::SETOLT: SSECC = 1; break;
8148    case ISD::SETOGE:
8149    case ISD::SETGE: Swap = true; // Fallthrough
8150    case ISD::SETLE:
8151    case ISD::SETOLE: SSECC = 2; break;
8152    case ISD::SETUO:  SSECC = 3; break;
8153    case ISD::SETUNE:
8154    case ISD::SETNE:  SSECC = 4; break;
8155    case ISD::SETULE: Swap = true;
8156    case ISD::SETUGE: SSECC = 5; break;
8157    case ISD::SETULT: Swap = true;
8158    case ISD::SETUGT: SSECC = 6; break;
8159    case ISD::SETO:   SSECC = 7; break;
8160    }
8161    if (Swap)
8162      std::swap(Op0, Op1);
8163
8164    // In the two special cases we can't handle, emit two comparisons.
8165    if (SSECC == 8) {
8166      if (SetCCOpcode == ISD::SETUEQ) {
8167        SDValue UNORD, EQ;
8168        UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
8169        EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
8170        return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
8171      }
8172      else if (SetCCOpcode == ISD::SETONE) {
8173        SDValue ORD, NEQ;
8174        ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
8175        NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
8176        return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
8177      }
8178      llvm_unreachable("Illegal FP comparison");
8179    }
8180    // Handle all other FP comparisons here.
8181    return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
8182  }
8183
8184  if (!isFP && VT.getSizeInBits() == 256)
8185    return SDValue();
8186
8187  // We are handling one of the integer comparisons here.  Since SSE only has
8188  // GT and EQ comparisons for integer, swapping operands and multiple
8189  // operations may be required for some comparisons.
8190  unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
8191  bool Swap = false, Invert = false, FlipSigns = false;
8192
8193  switch (VT.getSimpleVT().SimpleTy) {
8194  default: break;
8195  case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
8196  case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
8197  case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
8198  case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
8199  }
8200
8201  switch (SetCCOpcode) {
8202  default: break;
8203  case ISD::SETNE:  Invert = true;
8204  case ISD::SETEQ:  Opc = EQOpc; break;
8205  case ISD::SETLT:  Swap = true;
8206  case ISD::SETGT:  Opc = GTOpc; break;
8207  case ISD::SETGE:  Swap = true;
8208  case ISD::SETLE:  Opc = GTOpc; Invert = true; break;
8209  case ISD::SETULT: Swap = true;
8210  case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
8211  case ISD::SETUGE: Swap = true;
8212  case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
8213  }
8214  if (Swap)
8215    std::swap(Op0, Op1);
8216
8217  // Since SSE has no unsigned integer comparisons, we need to flip  the sign
8218  // bits of the inputs before performing those operations.
8219  if (FlipSigns) {
8220    EVT EltVT = VT.getVectorElementType();
8221    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
8222                                      EltVT);
8223    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
8224    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
8225                                    SignBits.size());
8226    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
8227    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
8228  }
8229
8230  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
8231
8232  // If the logical-not of the result is required, perform that now.
8233  if (Invert)
8234    Result = DAG.getNOT(dl, Result, VT);
8235
8236  return Result;
8237}
8238
8239// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
8240static bool isX86LogicalCmp(SDValue Op) {
8241  unsigned Opc = Op.getNode()->getOpcode();
8242  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
8243    return true;
8244  if (Op.getResNo() == 1 &&
8245      (Opc == X86ISD::ADD ||
8246       Opc == X86ISD::SUB ||
8247       Opc == X86ISD::ADC ||
8248       Opc == X86ISD::SBB ||
8249       Opc == X86ISD::SMUL ||
8250       Opc == X86ISD::UMUL ||
8251       Opc == X86ISD::INC ||
8252       Opc == X86ISD::DEC ||
8253       Opc == X86ISD::OR ||
8254       Opc == X86ISD::XOR ||
8255       Opc == X86ISD::AND))
8256    return true;
8257
8258  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
8259    return true;
8260
8261  return false;
8262}
8263
8264static bool isZero(SDValue V) {
8265  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
8266  return C && C->isNullValue();
8267}
8268
8269static bool isAllOnes(SDValue V) {
8270  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
8271  return C && C->isAllOnesValue();
8272}
8273
8274SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
8275  bool addTest = true;
8276  SDValue Cond  = Op.getOperand(0);
8277  SDValue Op1 = Op.getOperand(1);
8278  SDValue Op2 = Op.getOperand(2);
8279  DebugLoc DL = Op.getDebugLoc();
8280  SDValue CC;
8281
8282  if (Cond.getOpcode() == ISD::SETCC) {
8283    SDValue NewCond = LowerSETCC(Cond, DAG);
8284    if (NewCond.getNode())
8285      Cond = NewCond;
8286  }
8287
8288  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
8289  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
8290  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
8291  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
8292  if (Cond.getOpcode() == X86ISD::SETCC &&
8293      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
8294      isZero(Cond.getOperand(1).getOperand(1))) {
8295    SDValue Cmp = Cond.getOperand(1);
8296
8297    unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
8298
8299    if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
8300        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
8301      SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
8302
8303      SDValue CmpOp0 = Cmp.getOperand(0);
8304      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
8305                        CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
8306
8307      SDValue Res =   // Res = 0 or -1.
8308        DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
8309                    DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
8310
8311      if (isAllOnes(Op1) != (CondCode == X86::COND_E))
8312        Res = DAG.getNOT(DL, Res, Res.getValueType());
8313
8314      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
8315      if (N2C == 0 || !N2C->isNullValue())
8316        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
8317      return Res;
8318    }
8319  }
8320
8321  // Look past (and (setcc_carry (cmp ...)), 1).
8322  if (Cond.getOpcode() == ISD::AND &&
8323      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
8324    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
8325    if (C && C->getAPIntValue() == 1)
8326      Cond = Cond.getOperand(0);
8327  }
8328
8329  // If condition flag is set by a X86ISD::CMP, then use it as the condition
8330  // setting operand in place of the X86ISD::SETCC.
8331  if (Cond.getOpcode() == X86ISD::SETCC ||
8332      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
8333    CC = Cond.getOperand(0);
8334
8335    SDValue Cmp = Cond.getOperand(1);
8336    unsigned Opc = Cmp.getOpcode();
8337    EVT VT = Op.getValueType();
8338
8339    bool IllegalFPCMov = false;
8340    if (VT.isFloatingPoint() && !VT.isVector() &&
8341        !isScalarFPTypeInSSEReg(VT))  // FPStack?
8342      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
8343
8344    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
8345        Opc == X86ISD::BT) { // FIXME
8346      Cond = Cmp;
8347      addTest = false;
8348    }
8349  }
8350
8351  if (addTest) {
8352    // Look pass the truncate.
8353    if (Cond.getOpcode() == ISD::TRUNCATE)
8354      Cond = Cond.getOperand(0);
8355
8356    // We know the result of AND is compared against zero. Try to match
8357    // it to BT.
8358    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
8359      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
8360      if (NewSetCC.getNode()) {
8361        CC = NewSetCC.getOperand(0);
8362        Cond = NewSetCC.getOperand(1);
8363        addTest = false;
8364      }
8365    }
8366  }
8367
8368  if (addTest) {
8369    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
8370    Cond = EmitTest(Cond, X86::COND_NE, DAG);
8371  }
8372
8373  // a <  b ? -1 :  0 -> RES = ~setcc_carry
8374  // a <  b ?  0 : -1 -> RES = setcc_carry
8375  // a >= b ? -1 :  0 -> RES = setcc_carry
8376  // a >= b ?  0 : -1 -> RES = ~setcc_carry
8377  if (Cond.getOpcode() == X86ISD::CMP) {
8378    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
8379
8380    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
8381        (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
8382      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
8383                                DAG.getConstant(X86::COND_B, MVT::i8), Cond);
8384      if (isAllOnes(Op1) != (CondCode == X86::COND_B))
8385        return DAG.getNOT(DL, Res, Res.getValueType());
8386      return Res;
8387    }
8388  }
8389
8390  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
8391  // condition is true.
8392  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
8393  SDValue Ops[] = { Op2, Op1, CC, Cond };
8394  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
8395}
8396
8397// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
8398// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
8399// from the AND / OR.
8400static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
8401  Opc = Op.getOpcode();
8402  if (Opc != ISD::OR && Opc != ISD::AND)
8403    return false;
8404  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
8405          Op.getOperand(0).hasOneUse() &&
8406          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
8407          Op.getOperand(1).hasOneUse());
8408}
8409
8410// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
8411// 1 and that the SETCC node has a single use.
8412static bool isXor1OfSetCC(SDValue Op) {
8413  if (Op.getOpcode() != ISD::XOR)
8414    return false;
8415  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
8416  if (N1C && N1C->getAPIntValue() == 1) {
8417    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
8418      Op.getOperand(0).hasOneUse();
8419  }
8420  return false;
8421}
8422
8423SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
8424  bool addTest = true;
8425  SDValue Chain = Op.getOperand(0);
8426  SDValue Cond  = Op.getOperand(1);
8427  SDValue Dest  = Op.getOperand(2);
8428  DebugLoc dl = Op.getDebugLoc();
8429  SDValue CC;
8430
8431  if (Cond.getOpcode() == ISD::SETCC) {
8432    SDValue NewCond = LowerSETCC(Cond, DAG);
8433    if (NewCond.getNode())
8434      Cond = NewCond;
8435  }
8436#if 0
8437  // FIXME: LowerXALUO doesn't handle these!!
8438  else if (Cond.getOpcode() == X86ISD::ADD  ||
8439           Cond.getOpcode() == X86ISD::SUB  ||
8440           Cond.getOpcode() == X86ISD::SMUL ||
8441           Cond.getOpcode() == X86ISD::UMUL)
8442    Cond = LowerXALUO(Cond, DAG);
8443#endif
8444
8445  // Look pass (and (setcc_carry (cmp ...)), 1).
8446  if (Cond.getOpcode() == ISD::AND &&
8447      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
8448    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
8449    if (C && C->getAPIntValue() == 1)
8450      Cond = Cond.getOperand(0);
8451  }
8452
8453  // If condition flag is set by a X86ISD::CMP, then use it as the condition
8454  // setting operand in place of the X86ISD::SETCC.
8455  if (Cond.getOpcode() == X86ISD::SETCC ||
8456      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
8457    CC = Cond.getOperand(0);
8458
8459    SDValue Cmp = Cond.getOperand(1);
8460    unsigned Opc = Cmp.getOpcode();
8461    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
8462    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
8463      Cond = Cmp;
8464      addTest = false;
8465    } else {
8466      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
8467      default: break;
8468      case X86::COND_O:
8469      case X86::COND_B:
8470        // These can only come from an arithmetic instruction with overflow,
8471        // e.g. SADDO, UADDO.
8472        Cond = Cond.getNode()->getOperand(1);
8473        addTest = false;
8474        break;
8475      }
8476    }
8477  } else {
8478    unsigned CondOpc;
8479    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
8480      SDValue Cmp = Cond.getOperand(0).getOperand(1);
8481      if (CondOpc == ISD::OR) {
8482        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
8483        // two branches instead of an explicit OR instruction with a
8484        // separate test.
8485        if (Cmp == Cond.getOperand(1).getOperand(1) &&
8486            isX86LogicalCmp(Cmp)) {
8487          CC = Cond.getOperand(0).getOperand(0);
8488          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
8489                              Chain, Dest, CC, Cmp);
8490          CC = Cond.getOperand(1).getOperand(0);
8491          Cond = Cmp;
8492          addTest = false;
8493        }
8494      } else { // ISD::AND
8495        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
8496        // two branches instead of an explicit AND instruction with a
8497        // separate test. However, we only do this if this block doesn't
8498        // have a fall-through edge, because this requires an explicit
8499        // jmp when the condition is false.
8500        if (Cmp == Cond.getOperand(1).getOperand(1) &&
8501            isX86LogicalCmp(Cmp) &&
8502            Op.getNode()->hasOneUse()) {
8503          X86::CondCode CCode =
8504            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
8505          CCode = X86::GetOppositeBranchCondition(CCode);
8506          CC = DAG.getConstant(CCode, MVT::i8);
8507          SDNode *User = *Op.getNode()->use_begin();
8508          // Look for an unconditional branch following this conditional branch.
8509          // We need this because we need to reverse the successors in order
8510          // to implement FCMP_OEQ.
8511          if (User->getOpcode() == ISD::BR) {
8512            SDValue FalseBB = User->getOperand(1);
8513            SDNode *NewBR =
8514              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
8515            assert(NewBR == User);
8516            (void)NewBR;
8517            Dest = FalseBB;
8518
8519            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
8520                                Chain, Dest, CC, Cmp);
8521            X86::CondCode CCode =
8522              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
8523            CCode = X86::GetOppositeBranchCondition(CCode);
8524            CC = DAG.getConstant(CCode, MVT::i8);
8525            Cond = Cmp;
8526            addTest = false;
8527          }
8528        }
8529      }
8530    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
8531      // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
8532      // It should be transformed during dag combiner except when the condition
8533      // is set by a arithmetics with overflow node.
8534      X86::CondCode CCode =
8535        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
8536      CCode = X86::GetOppositeBranchCondition(CCode);
8537      CC = DAG.getConstant(CCode, MVT::i8);
8538      Cond = Cond.getOperand(0).getOperand(1);
8539      addTest = false;
8540    }
8541  }
8542
8543  if (addTest) {
8544    // Look pass the truncate.
8545    if (Cond.getOpcode() == ISD::TRUNCATE)
8546      Cond = Cond.getOperand(0);
8547
8548    // We know the result of AND is compared against zero. Try to match
8549    // it to BT.
8550    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
8551      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
8552      if (NewSetCC.getNode()) {
8553        CC = NewSetCC.getOperand(0);
8554        Cond = NewSetCC.getOperand(1);
8555        addTest = false;
8556      }
8557    }
8558  }
8559
8560  if (addTest) {
8561    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
8562    Cond = EmitTest(Cond, X86::COND_NE, DAG);
8563  }
8564  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
8565                     Chain, Dest, CC, Cond);
8566}
8567
8568
8569// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
8570// Calls to _alloca is needed to probe the stack when allocating more than 4k
8571// bytes in one go. Touching the stack at 4K increments is necessary to ensure
8572// that the guard pages used by the OS virtual memory manager are allocated in
8573// correct sequence.
8574SDValue
8575X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
8576                                           SelectionDAG &DAG) const {
8577  assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) &&
8578         "This should be used only on Windows targets");
8579  assert(!Subtarget->isTargetEnvMacho());
8580  DebugLoc dl = Op.getDebugLoc();
8581
8582  // Get the inputs.
8583  SDValue Chain = Op.getOperand(0);
8584  SDValue Size  = Op.getOperand(1);
8585  // FIXME: Ensure alignment here
8586
8587  SDValue Flag;
8588
8589  EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
8590  unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
8591
8592  Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
8593  Flag = Chain.getValue(1);
8594
8595  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8596
8597  Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
8598  Flag = Chain.getValue(1);
8599
8600  Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
8601
8602  SDValue Ops1[2] = { Chain.getValue(0), Chain };
8603  return DAG.getMergeValues(Ops1, 2, dl);
8604}
8605
8606SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
8607  MachineFunction &MF = DAG.getMachineFunction();
8608  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
8609
8610  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
8611  DebugLoc DL = Op.getDebugLoc();
8612
8613  if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
8614    // vastart just stores the address of the VarArgsFrameIndex slot into the
8615    // memory location argument.
8616    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
8617                                   getPointerTy());
8618    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
8619                        MachinePointerInfo(SV), false, false, 0);
8620  }
8621
8622  // __va_list_tag:
8623  //   gp_offset         (0 - 6 * 8)
8624  //   fp_offset         (48 - 48 + 8 * 16)
8625  //   overflow_arg_area (point to parameters coming in memory).
8626  //   reg_save_area
8627  SmallVector<SDValue, 8> MemOps;
8628  SDValue FIN = Op.getOperand(1);
8629  // Store gp_offset
8630  SDValue Store = DAG.getStore(Op.getOperand(0), DL,
8631                               DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
8632                                               MVT::i32),
8633                               FIN, MachinePointerInfo(SV), false, false, 0);
8634  MemOps.push_back(Store);
8635
8636  // Store fp_offset
8637  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
8638                    FIN, DAG.getIntPtrConstant(4));
8639  Store = DAG.getStore(Op.getOperand(0), DL,
8640                       DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
8641                                       MVT::i32),
8642                       FIN, MachinePointerInfo(SV, 4), false, false, 0);
8643  MemOps.push_back(Store);
8644
8645  // Store ptr to overflow_arg_area
8646  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
8647                    FIN, DAG.getIntPtrConstant(4));
8648  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
8649                                    getPointerTy());
8650  Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
8651                       MachinePointerInfo(SV, 8),
8652                       false, false, 0);
8653  MemOps.push_back(Store);
8654
8655  // Store ptr to reg_save_area.
8656  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
8657                    FIN, DAG.getIntPtrConstant(8));
8658  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
8659                                    getPointerTy());
8660  Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
8661                       MachinePointerInfo(SV, 16), false, false, 0);
8662  MemOps.push_back(Store);
8663  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
8664                     &MemOps[0], MemOps.size());
8665}
8666
8667SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
8668  assert(Subtarget->is64Bit() &&
8669         "LowerVAARG only handles 64-bit va_arg!");
8670  assert((Subtarget->isTargetLinux() ||
8671          Subtarget->isTargetDarwin()) &&
8672          "Unhandled target in LowerVAARG");
8673  assert(Op.getNode()->getNumOperands() == 4);
8674  SDValue Chain = Op.getOperand(0);
8675  SDValue SrcPtr = Op.getOperand(1);
8676  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
8677  unsigned Align = Op.getConstantOperandVal(3);
8678  DebugLoc dl = Op.getDebugLoc();
8679
8680  EVT ArgVT = Op.getNode()->getValueType(0);
8681  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
8682  uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy);
8683  uint8_t ArgMode;
8684
8685  // Decide which area this value should be read from.
8686  // TODO: Implement the AMD64 ABI in its entirety. This simple
8687  // selection mechanism works only for the basic types.
8688  if (ArgVT == MVT::f80) {
8689    llvm_unreachable("va_arg for f80 not yet implemented");
8690  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
8691    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
8692  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
8693    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
8694  } else {
8695    llvm_unreachable("Unhandled argument type in LowerVAARG");
8696  }
8697
8698  if (ArgMode == 2) {
8699    // Sanity Check: Make sure using fp_offset makes sense.
8700    assert(!UseSoftFloat &&
8701           !(DAG.getMachineFunction()
8702                .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) &&
8703           Subtarget->hasXMM());
8704  }
8705
8706  // Insert VAARG_64 node into the DAG
8707  // VAARG_64 returns two values: Variable Argument Address, Chain
8708  SmallVector<SDValue, 11> InstOps;
8709  InstOps.push_back(Chain);
8710  InstOps.push_back(SrcPtr);
8711  InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
8712  InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
8713  InstOps.push_back(DAG.getConstant(Align, MVT::i32));
8714  SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
8715  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
8716                                          VTs, &InstOps[0], InstOps.size(),
8717                                          MVT::i64,
8718                                          MachinePointerInfo(SV),
8719                                          /*Align=*/0,
8720                                          /*Volatile=*/false,
8721                                          /*ReadMem=*/true,
8722                                          /*WriteMem=*/true);
8723  Chain = VAARG.getValue(1);
8724
8725  // Load the next argument and return it
8726  return DAG.getLoad(ArgVT, dl,
8727                     Chain,
8728                     VAARG,
8729                     MachinePointerInfo(),
8730                     false, false, 0);
8731}
8732
8733SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
8734  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
8735  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
8736  SDValue Chain = Op.getOperand(0);
8737  SDValue DstPtr = Op.getOperand(1);
8738  SDValue SrcPtr = Op.getOperand(2);
8739  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
8740  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
8741  DebugLoc DL = Op.getDebugLoc();
8742
8743  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
8744                       DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
8745                       false,
8746                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
8747}
8748
8749SDValue
8750X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
8751  DebugLoc dl = Op.getDebugLoc();
8752  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
8753  switch (IntNo) {
8754  default: return SDValue();    // Don't custom lower most intrinsics.
8755  // Comparison intrinsics.
8756  case Intrinsic::x86_sse_comieq_ss:
8757  case Intrinsic::x86_sse_comilt_ss:
8758  case Intrinsic::x86_sse_comile_ss:
8759  case Intrinsic::x86_sse_comigt_ss:
8760  case Intrinsic::x86_sse_comige_ss:
8761  case Intrinsic::x86_sse_comineq_ss:
8762  case Intrinsic::x86_sse_ucomieq_ss:
8763  case Intrinsic::x86_sse_ucomilt_ss:
8764  case Intrinsic::x86_sse_ucomile_ss:
8765  case Intrinsic::x86_sse_ucomigt_ss:
8766  case Intrinsic::x86_sse_ucomige_ss:
8767  case Intrinsic::x86_sse_ucomineq_ss:
8768  case Intrinsic::x86_sse2_comieq_sd:
8769  case Intrinsic::x86_sse2_comilt_sd:
8770  case Intrinsic::x86_sse2_comile_sd:
8771  case Intrinsic::x86_sse2_comigt_sd:
8772  case Intrinsic::x86_sse2_comige_sd:
8773  case Intrinsic::x86_sse2_comineq_sd:
8774  case Intrinsic::x86_sse2_ucomieq_sd:
8775  case Intrinsic::x86_sse2_ucomilt_sd:
8776  case Intrinsic::x86_sse2_ucomile_sd:
8777  case Intrinsic::x86_sse2_ucomigt_sd:
8778  case Intrinsic::x86_sse2_ucomige_sd:
8779  case Intrinsic::x86_sse2_ucomineq_sd: {
8780    unsigned Opc = 0;
8781    ISD::CondCode CC = ISD::SETCC_INVALID;
8782    switch (IntNo) {
8783    default: break;
8784    case Intrinsic::x86_sse_comieq_ss:
8785    case Intrinsic::x86_sse2_comieq_sd:
8786      Opc = X86ISD::COMI;
8787      CC = ISD::SETEQ;
8788      break;
8789    case Intrinsic::x86_sse_comilt_ss:
8790    case Intrinsic::x86_sse2_comilt_sd:
8791      Opc = X86ISD::COMI;
8792      CC = ISD::SETLT;
8793      break;
8794    case Intrinsic::x86_sse_comile_ss:
8795    case Intrinsic::x86_sse2_comile_sd:
8796      Opc = X86ISD::COMI;
8797      CC = ISD::SETLE;
8798      break;
8799    case Intrinsic::x86_sse_comigt_ss:
8800    case Intrinsic::x86_sse2_comigt_sd:
8801      Opc = X86ISD::COMI;
8802      CC = ISD::SETGT;
8803      break;
8804    case Intrinsic::x86_sse_comige_ss:
8805    case Intrinsic::x86_sse2_comige_sd:
8806      Opc = X86ISD::COMI;
8807      CC = ISD::SETGE;
8808      break;
8809    case Intrinsic::x86_sse_comineq_ss:
8810    case Intrinsic::x86_sse2_comineq_sd:
8811      Opc = X86ISD::COMI;
8812      CC = ISD::SETNE;
8813      break;
8814    case Intrinsic::x86_sse_ucomieq_ss:
8815    case Intrinsic::x86_sse2_ucomieq_sd:
8816      Opc = X86ISD::UCOMI;
8817      CC = ISD::SETEQ;
8818      break;
8819    case Intrinsic::x86_sse_ucomilt_ss:
8820    case Intrinsic::x86_sse2_ucomilt_sd:
8821      Opc = X86ISD::UCOMI;
8822      CC = ISD::SETLT;
8823      break;
8824    case Intrinsic::x86_sse_ucomile_ss:
8825    case Intrinsic::x86_sse2_ucomile_sd:
8826      Opc = X86ISD::UCOMI;
8827      CC = ISD::SETLE;
8828      break;
8829    case Intrinsic::x86_sse_ucomigt_ss:
8830    case Intrinsic::x86_sse2_ucomigt_sd:
8831      Opc = X86ISD::UCOMI;
8832      CC = ISD::SETGT;
8833      break;
8834    case Intrinsic::x86_sse_ucomige_ss:
8835    case Intrinsic::x86_sse2_ucomige_sd:
8836      Opc = X86ISD::UCOMI;
8837      CC = ISD::SETGE;
8838      break;
8839    case Intrinsic::x86_sse_ucomineq_ss:
8840    case Intrinsic::x86_sse2_ucomineq_sd:
8841      Opc = X86ISD::UCOMI;
8842      CC = ISD::SETNE;
8843      break;
8844    }
8845
8846    SDValue LHS = Op.getOperand(1);
8847    SDValue RHS = Op.getOperand(2);
8848    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
8849    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
8850    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
8851    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
8852                                DAG.getConstant(X86CC, MVT::i8), Cond);
8853    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
8854  }
8855  // ptest and testp intrinsics. The intrinsic these come from are designed to
8856  // return an integer value, not just an instruction so lower it to the ptest
8857  // or testp pattern and a setcc for the result.
8858  case Intrinsic::x86_sse41_ptestz:
8859  case Intrinsic::x86_sse41_ptestc:
8860  case Intrinsic::x86_sse41_ptestnzc:
8861  case Intrinsic::x86_avx_ptestz_256:
8862  case Intrinsic::x86_avx_ptestc_256:
8863  case Intrinsic::x86_avx_ptestnzc_256:
8864  case Intrinsic::x86_avx_vtestz_ps:
8865  case Intrinsic::x86_avx_vtestc_ps:
8866  case Intrinsic::x86_avx_vtestnzc_ps:
8867  case Intrinsic::x86_avx_vtestz_pd:
8868  case Intrinsic::x86_avx_vtestc_pd:
8869  case Intrinsic::x86_avx_vtestnzc_pd:
8870  case Intrinsic::x86_avx_vtestz_ps_256:
8871  case Intrinsic::x86_avx_vtestc_ps_256:
8872  case Intrinsic::x86_avx_vtestnzc_ps_256:
8873  case Intrinsic::x86_avx_vtestz_pd_256:
8874  case Intrinsic::x86_avx_vtestc_pd_256:
8875  case Intrinsic::x86_avx_vtestnzc_pd_256: {
8876    bool IsTestPacked = false;
8877    unsigned X86CC = 0;
8878    switch (IntNo) {
8879    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
8880    case Intrinsic::x86_avx_vtestz_ps:
8881    case Intrinsic::x86_avx_vtestz_pd:
8882    case Intrinsic::x86_avx_vtestz_ps_256:
8883    case Intrinsic::x86_avx_vtestz_pd_256:
8884      IsTestPacked = true; // Fallthrough
8885    case Intrinsic::x86_sse41_ptestz:
8886    case Intrinsic::x86_avx_ptestz_256:
8887      // ZF = 1
8888      X86CC = X86::COND_E;
8889      break;
8890    case Intrinsic::x86_avx_vtestc_ps:
8891    case Intrinsic::x86_avx_vtestc_pd:
8892    case Intrinsic::x86_avx_vtestc_ps_256:
8893    case Intrinsic::x86_avx_vtestc_pd_256:
8894      IsTestPacked = true; // Fallthrough
8895    case Intrinsic::x86_sse41_ptestc:
8896    case Intrinsic::x86_avx_ptestc_256:
8897      // CF = 1
8898      X86CC = X86::COND_B;
8899      break;
8900    case Intrinsic::x86_avx_vtestnzc_ps:
8901    case Intrinsic::x86_avx_vtestnzc_pd:
8902    case Intrinsic::x86_avx_vtestnzc_ps_256:
8903    case Intrinsic::x86_avx_vtestnzc_pd_256:
8904      IsTestPacked = true; // Fallthrough
8905    case Intrinsic::x86_sse41_ptestnzc:
8906    case Intrinsic::x86_avx_ptestnzc_256:
8907      // ZF and CF = 0
8908      X86CC = X86::COND_A;
8909      break;
8910    }
8911
8912    SDValue LHS = Op.getOperand(1);
8913    SDValue RHS = Op.getOperand(2);
8914    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
8915    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
8916    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
8917    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
8918    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
8919  }
8920
8921  // Fix vector shift instructions where the last operand is a non-immediate
8922  // i32 value.
8923  case Intrinsic::x86_sse2_pslli_w:
8924  case Intrinsic::x86_sse2_pslli_d:
8925  case Intrinsic::x86_sse2_pslli_q:
8926  case Intrinsic::x86_sse2_psrli_w:
8927  case Intrinsic::x86_sse2_psrli_d:
8928  case Intrinsic::x86_sse2_psrli_q:
8929  case Intrinsic::x86_sse2_psrai_w:
8930  case Intrinsic::x86_sse2_psrai_d:
8931  case Intrinsic::x86_mmx_pslli_w:
8932  case Intrinsic::x86_mmx_pslli_d:
8933  case Intrinsic::x86_mmx_pslli_q:
8934  case Intrinsic::x86_mmx_psrli_w:
8935  case Intrinsic::x86_mmx_psrli_d:
8936  case Intrinsic::x86_mmx_psrli_q:
8937  case Intrinsic::x86_mmx_psrai_w:
8938  case Intrinsic::x86_mmx_psrai_d: {
8939    SDValue ShAmt = Op.getOperand(2);
8940    if (isa<ConstantSDNode>(ShAmt))
8941      return SDValue();
8942
8943    unsigned NewIntNo = 0;
8944    EVT ShAmtVT = MVT::v4i32;
8945    switch (IntNo) {
8946    case Intrinsic::x86_sse2_pslli_w:
8947      NewIntNo = Intrinsic::x86_sse2_psll_w;
8948      break;
8949    case Intrinsic::x86_sse2_pslli_d:
8950      NewIntNo = Intrinsic::x86_sse2_psll_d;
8951      break;
8952    case Intrinsic::x86_sse2_pslli_q:
8953      NewIntNo = Intrinsic::x86_sse2_psll_q;
8954      break;
8955    case Intrinsic::x86_sse2_psrli_w:
8956      NewIntNo = Intrinsic::x86_sse2_psrl_w;
8957      break;
8958    case Intrinsic::x86_sse2_psrli_d:
8959      NewIntNo = Intrinsic::x86_sse2_psrl_d;
8960      break;
8961    case Intrinsic::x86_sse2_psrli_q:
8962      NewIntNo = Intrinsic::x86_sse2_psrl_q;
8963      break;
8964    case Intrinsic::x86_sse2_psrai_w:
8965      NewIntNo = Intrinsic::x86_sse2_psra_w;
8966      break;
8967    case Intrinsic::x86_sse2_psrai_d:
8968      NewIntNo = Intrinsic::x86_sse2_psra_d;
8969      break;
8970    default: {
8971      ShAmtVT = MVT::v2i32;
8972      switch (IntNo) {
8973      case Intrinsic::x86_mmx_pslli_w:
8974        NewIntNo = Intrinsic::x86_mmx_psll_w;
8975        break;
8976      case Intrinsic::x86_mmx_pslli_d:
8977        NewIntNo = Intrinsic::x86_mmx_psll_d;
8978        break;
8979      case Intrinsic::x86_mmx_pslli_q:
8980        NewIntNo = Intrinsic::x86_mmx_psll_q;
8981        break;
8982      case Intrinsic::x86_mmx_psrli_w:
8983        NewIntNo = Intrinsic::x86_mmx_psrl_w;
8984        break;
8985      case Intrinsic::x86_mmx_psrli_d:
8986        NewIntNo = Intrinsic::x86_mmx_psrl_d;
8987        break;
8988      case Intrinsic::x86_mmx_psrli_q:
8989        NewIntNo = Intrinsic::x86_mmx_psrl_q;
8990        break;
8991      case Intrinsic::x86_mmx_psrai_w:
8992        NewIntNo = Intrinsic::x86_mmx_psra_w;
8993        break;
8994      case Intrinsic::x86_mmx_psrai_d:
8995        NewIntNo = Intrinsic::x86_mmx_psra_d;
8996        break;
8997      default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
8998      }
8999      break;
9000    }
9001    }
9002
9003    // The vector shift intrinsics with scalars uses 32b shift amounts but
9004    // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits
9005    // to be zero.
9006    SDValue ShOps[4];
9007    ShOps[0] = ShAmt;
9008    ShOps[1] = DAG.getConstant(0, MVT::i32);
9009    if (ShAmtVT == MVT::v4i32) {
9010      ShOps[2] = DAG.getUNDEF(MVT::i32);
9011      ShOps[3] = DAG.getUNDEF(MVT::i32);
9012      ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
9013    } else {
9014      ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
9015// FIXME this must be lowered to get rid of the invalid type.
9016    }
9017
9018    EVT VT = Op.getValueType();
9019    ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
9020    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9021                       DAG.getConstant(NewIntNo, MVT::i32),
9022                       Op.getOperand(1), ShAmt);
9023  }
9024  }
9025}
9026
9027SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
9028                                           SelectionDAG &DAG) const {
9029  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
9030  MFI->setReturnAddressIsTaken(true);
9031
9032  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9033  DebugLoc dl = Op.getDebugLoc();
9034
9035  if (Depth > 0) {
9036    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
9037    SDValue Offset =
9038      DAG.getConstant(TD->getPointerSize(),
9039                      Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
9040    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
9041                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
9042                                   FrameAddr, Offset),
9043                       MachinePointerInfo(), false, false, 0);
9044  }
9045
9046  // Just load the return address.
9047  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
9048  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
9049                     RetAddrFI, MachinePointerInfo(), false, false, 0);
9050}
9051
9052SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
9053  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
9054  MFI->setFrameAddressIsTaken(true);
9055
9056  EVT VT = Op.getValueType();
9057  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
9058  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9059  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
9060  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
9061  while (Depth--)
9062    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
9063                            MachinePointerInfo(),
9064                            false, false, 0);
9065  return FrameAddr;
9066}
9067
9068SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
9069                                                     SelectionDAG &DAG) const {
9070  return DAG.getIntPtrConstant(2*TD->getPointerSize());
9071}
9072
9073SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
9074  MachineFunction &MF = DAG.getMachineFunction();
9075  SDValue Chain     = Op.getOperand(0);
9076  SDValue Offset    = Op.getOperand(1);
9077  SDValue Handler   = Op.getOperand(2);
9078  DebugLoc dl       = Op.getDebugLoc();
9079
9080  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
9081                                     Subtarget->is64Bit() ? X86::RBP : X86::EBP,
9082                                     getPointerTy());
9083  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
9084
9085  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame,
9086                                  DAG.getIntPtrConstant(TD->getPointerSize()));
9087  StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
9088  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
9089                       false, false, 0);
9090  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
9091  MF.getRegInfo().addLiveOut(StoreAddrReg);
9092
9093  return DAG.getNode(X86ISD::EH_RETURN, dl,
9094                     MVT::Other,
9095                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
9096}
9097
9098SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
9099                                             SelectionDAG &DAG) const {
9100  SDValue Root = Op.getOperand(0);
9101  SDValue Trmp = Op.getOperand(1); // trampoline
9102  SDValue FPtr = Op.getOperand(2); // nested function
9103  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
9104  DebugLoc dl  = Op.getDebugLoc();
9105
9106  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
9107
9108  if (Subtarget->is64Bit()) {
9109    SDValue OutChains[6];
9110
9111    // Large code-model.
9112    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
9113    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
9114
9115    const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10);
9116    const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11);
9117
9118    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
9119
9120    // Load the pointer to the nested function into R11.
9121    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
9122    SDValue Addr = Trmp;
9123    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
9124                                Addr, MachinePointerInfo(TrmpAddr),
9125                                false, false, 0);
9126
9127    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
9128                       DAG.getConstant(2, MVT::i64));
9129    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
9130                                MachinePointerInfo(TrmpAddr, 2),
9131                                false, false, 2);
9132
9133    // Load the 'nest' parameter value into R10.
9134    // R10 is specified in X86CallingConv.td
9135    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
9136    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
9137                       DAG.getConstant(10, MVT::i64));
9138    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
9139                                Addr, MachinePointerInfo(TrmpAddr, 10),
9140                                false, false, 0);
9141
9142    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
9143                       DAG.getConstant(12, MVT::i64));
9144    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
9145                                MachinePointerInfo(TrmpAddr, 12),
9146                                false, false, 2);
9147
9148    // Jump to the nested function.
9149    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
9150    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
9151                       DAG.getConstant(20, MVT::i64));
9152    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
9153                                Addr, MachinePointerInfo(TrmpAddr, 20),
9154                                false, false, 0);
9155
9156    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
9157    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
9158                       DAG.getConstant(22, MVT::i64));
9159    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
9160                                MachinePointerInfo(TrmpAddr, 22),
9161                                false, false, 0);
9162
9163    SDValue Ops[] =
9164      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
9165    return DAG.getMergeValues(Ops, 2, dl);
9166  } else {
9167    const Function *Func =
9168      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
9169    CallingConv::ID CC = Func->getCallingConv();
9170    unsigned NestReg;
9171
9172    switch (CC) {
9173    default:
9174      llvm_unreachable("Unsupported calling convention");
9175    case CallingConv::C:
9176    case CallingConv::X86_StdCall: {
9177      // Pass 'nest' parameter in ECX.
9178      // Must be kept in sync with X86CallingConv.td
9179      NestReg = X86::ECX;
9180
9181      // Check that ECX wasn't needed by an 'inreg' parameter.
9182      FunctionType *FTy = Func->getFunctionType();
9183      const AttrListPtr &Attrs = Func->getAttributes();
9184
9185      if (!Attrs.isEmpty() && !Func->isVarArg()) {
9186        unsigned InRegCount = 0;
9187        unsigned Idx = 1;
9188
9189        for (FunctionType::param_iterator I = FTy->param_begin(),
9190             E = FTy->param_end(); I != E; ++I, ++Idx)
9191          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
9192            // FIXME: should only count parameters that are lowered to integers.
9193            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
9194
9195        if (InRegCount > 2) {
9196          report_fatal_error("Nest register in use - reduce number of inreg"
9197                             " parameters!");
9198        }
9199      }
9200      break;
9201    }
9202    case CallingConv::X86_FastCall:
9203    case CallingConv::X86_ThisCall:
9204    case CallingConv::Fast:
9205      // Pass 'nest' parameter in EAX.
9206      // Must be kept in sync with X86CallingConv.td
9207      NestReg = X86::EAX;
9208      break;
9209    }
9210
9211    SDValue OutChains[4];
9212    SDValue Addr, Disp;
9213
9214    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
9215                       DAG.getConstant(10, MVT::i32));
9216    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
9217
9218    // This is storing the opcode for MOV32ri.
9219    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
9220    const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg);
9221    OutChains[0] = DAG.getStore(Root, dl,
9222                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
9223                                Trmp, MachinePointerInfo(TrmpAddr),
9224                                false, false, 0);
9225
9226    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
9227                       DAG.getConstant(1, MVT::i32));
9228    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
9229                                MachinePointerInfo(TrmpAddr, 1),
9230                                false, false, 1);
9231
9232    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
9233    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
9234                       DAG.getConstant(5, MVT::i32));
9235    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
9236                                MachinePointerInfo(TrmpAddr, 5),
9237                                false, false, 1);
9238
9239    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
9240                       DAG.getConstant(6, MVT::i32));
9241    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
9242                                MachinePointerInfo(TrmpAddr, 6),
9243                                false, false, 1);
9244
9245    SDValue Ops[] =
9246      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
9247    return DAG.getMergeValues(Ops, 2, dl);
9248  }
9249}
9250
9251SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
9252                                            SelectionDAG &DAG) const {
9253  /*
9254   The rounding mode is in bits 11:10 of FPSR, and has the following
9255   settings:
9256     00 Round to nearest
9257     01 Round to -inf
9258     10 Round to +inf
9259     11 Round to 0
9260
9261  FLT_ROUNDS, on the other hand, expects the following:
9262    -1 Undefined
9263     0 Round to 0
9264     1 Round to nearest
9265     2 Round to +inf
9266     3 Round to -inf
9267
9268  To perform the conversion, we do:
9269    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
9270  */
9271
9272  MachineFunction &MF = DAG.getMachineFunction();
9273  const TargetMachine &TM = MF.getTarget();
9274  const TargetFrameLowering &TFI = *TM.getFrameLowering();
9275  unsigned StackAlignment = TFI.getStackAlignment();
9276  EVT VT = Op.getValueType();
9277  DebugLoc DL = Op.getDebugLoc();
9278
9279  // Save FP Control Word to stack slot
9280  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
9281  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
9282
9283
9284  MachineMemOperand *MMO =
9285   MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
9286                           MachineMemOperand::MOStore, 2, 2);
9287
9288  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
9289  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
9290                                          DAG.getVTList(MVT::Other),
9291                                          Ops, 2, MVT::i16, MMO);
9292
9293  // Load FP Control Word from stack slot
9294  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
9295                            MachinePointerInfo(), false, false, 0);
9296
9297  // Transform as necessary
9298  SDValue CWD1 =
9299    DAG.getNode(ISD::SRL, DL, MVT::i16,
9300                DAG.getNode(ISD::AND, DL, MVT::i16,
9301                            CWD, DAG.getConstant(0x800, MVT::i16)),
9302                DAG.getConstant(11, MVT::i8));
9303  SDValue CWD2 =
9304    DAG.getNode(ISD::SRL, DL, MVT::i16,
9305                DAG.getNode(ISD::AND, DL, MVT::i16,
9306                            CWD, DAG.getConstant(0x400, MVT::i16)),
9307                DAG.getConstant(9, MVT::i8));
9308
9309  SDValue RetVal =
9310    DAG.getNode(ISD::AND, DL, MVT::i16,
9311                DAG.getNode(ISD::ADD, DL, MVT::i16,
9312                            DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
9313                            DAG.getConstant(1, MVT::i16)),
9314                DAG.getConstant(3, MVT::i16));
9315
9316
9317  return DAG.getNode((VT.getSizeInBits() < 16 ?
9318                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
9319}
9320
9321SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
9322  EVT VT = Op.getValueType();
9323  EVT OpVT = VT;
9324  unsigned NumBits = VT.getSizeInBits();
9325  DebugLoc dl = Op.getDebugLoc();
9326
9327  Op = Op.getOperand(0);
9328  if (VT == MVT::i8) {
9329    // Zero extend to i32 since there is not an i8 bsr.
9330    OpVT = MVT::i32;
9331    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
9332  }
9333
9334  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
9335  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
9336  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
9337
9338  // If src is zero (i.e. bsr sets ZF), returns NumBits.
9339  SDValue Ops[] = {
9340    Op,
9341    DAG.getConstant(NumBits+NumBits-1, OpVT),
9342    DAG.getConstant(X86::COND_E, MVT::i8),
9343    Op.getValue(1)
9344  };
9345  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
9346
9347  // Finally xor with NumBits-1.
9348  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
9349
9350  if (VT == MVT::i8)
9351    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
9352  return Op;
9353}
9354
9355SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
9356  EVT VT = Op.getValueType();
9357  EVT OpVT = VT;
9358  unsigned NumBits = VT.getSizeInBits();
9359  DebugLoc dl = Op.getDebugLoc();
9360
9361  Op = Op.getOperand(0);
9362  if (VT == MVT::i8) {
9363    OpVT = MVT::i32;
9364    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
9365  }
9366
9367  // Issue a bsf (scan bits forward) which also sets EFLAGS.
9368  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
9369  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
9370
9371  // If src is zero (i.e. bsf sets ZF), returns NumBits.
9372  SDValue Ops[] = {
9373    Op,
9374    DAG.getConstant(NumBits, OpVT),
9375    DAG.getConstant(X86::COND_E, MVT::i8),
9376    Op.getValue(1)
9377  };
9378  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
9379
9380  if (VT == MVT::i8)
9381    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
9382  return Op;
9383}
9384
9385SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
9386  EVT VT = Op.getValueType();
9387  assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
9388  DebugLoc dl = Op.getDebugLoc();
9389
9390  //  ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
9391  //  ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
9392  //  ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
9393  //  ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
9394  //  ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
9395  //
9396  //  AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
9397  //  AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
9398  //  return AloBlo + AloBhi + AhiBlo;
9399
9400  SDValue A = Op.getOperand(0);
9401  SDValue B = Op.getOperand(1);
9402
9403  SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9404                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
9405                       A, DAG.getConstant(32, MVT::i32));
9406  SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9407                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
9408                       B, DAG.getConstant(32, MVT::i32));
9409  SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9410                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
9411                       A, B);
9412  SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9413                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
9414                       A, Bhi);
9415  SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9416                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
9417                       Ahi, B);
9418  AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9419                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
9420                       AloBhi, DAG.getConstant(32, MVT::i32));
9421  AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9422                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
9423                       AhiBlo, DAG.getConstant(32, MVT::i32));
9424  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
9425  Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
9426  return Res;
9427}
9428
9429SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
9430
9431  EVT VT = Op.getValueType();
9432  DebugLoc dl = Op.getDebugLoc();
9433  SDValue R = Op.getOperand(0);
9434  SDValue Amt = Op.getOperand(1);
9435  LLVMContext *Context = DAG.getContext();
9436
9437  if (!(Subtarget->hasSSE2() || Subtarget->hasAVX()))
9438    return SDValue();
9439
9440  // Decompose 256-bit shifts into smaller 128-bit shifts.
9441  if (VT.getSizeInBits() == 256) {
9442    int NumElems = VT.getVectorNumElements();
9443    MVT EltVT = VT.getVectorElementType().getSimpleVT();
9444    EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
9445
9446    // Extract the two vectors
9447    SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
9448    SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
9449                                     DAG, dl);
9450
9451    // Recreate the shift amount vectors
9452    SmallVector<SDValue, 4> Amt1Csts;
9453    SmallVector<SDValue, 4> Amt2Csts;
9454    for (int i = 0; i < NumElems/2; ++i)
9455      Amt1Csts.push_back(Amt->getOperand(i));
9456    for (int i = NumElems/2; i < NumElems; ++i)
9457      Amt2Csts.push_back(Amt->getOperand(i));
9458
9459    SDValue Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
9460                               &Amt1Csts[0], NumElems/2);
9461    SDValue Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
9462                               &Amt2Csts[0], NumElems/2);
9463
9464    // Issue new vector shifts for the smaller types
9465    V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
9466    V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
9467
9468    // Concatenate the result back
9469    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
9470  }
9471
9472  // Optimize shl/srl/sra with constant shift amount.
9473  if (isSplatVector(Amt.getNode())) {
9474    SDValue SclrAmt = Amt->getOperand(0);
9475    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
9476      uint64_t ShiftAmt = C->getZExtValue();
9477
9478      if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL)
9479       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9480                     DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
9481                     R, DAG.getConstant(ShiftAmt, MVT::i32));
9482
9483      if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL)
9484       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9485                     DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
9486                     R, DAG.getConstant(ShiftAmt, MVT::i32));
9487
9488      if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL)
9489       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9490                     DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
9491                     R, DAG.getConstant(ShiftAmt, MVT::i32));
9492
9493      if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL)
9494       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9495                     DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
9496                     R, DAG.getConstant(ShiftAmt, MVT::i32));
9497
9498      if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL)
9499       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9500                     DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
9501                     R, DAG.getConstant(ShiftAmt, MVT::i32));
9502
9503      if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL)
9504       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9505                     DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
9506                     R, DAG.getConstant(ShiftAmt, MVT::i32));
9507
9508      if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA)
9509       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9510                     DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
9511                     R, DAG.getConstant(ShiftAmt, MVT::i32));
9512
9513      if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA)
9514       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9515                     DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
9516                     R, DAG.getConstant(ShiftAmt, MVT::i32));
9517    }
9518  }
9519
9520  // Lower SHL with variable shift amount.
9521  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
9522    Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9523                     DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
9524                     Op.getOperand(1), DAG.getConstant(23, MVT::i32));
9525
9526    ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U));
9527
9528    std::vector<Constant*> CV(4, CI);
9529    Constant *C = ConstantVector::get(CV);
9530    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
9531    SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
9532                                 MachinePointerInfo::getConstantPool(),
9533                                 false, false, 16);
9534
9535    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend);
9536    Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
9537    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
9538    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
9539  }
9540  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
9541    // a = a << 5;
9542    Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9543                     DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
9544                     Op.getOperand(1), DAG.getConstant(5, MVT::i32));
9545
9546    ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15));
9547    ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63));
9548
9549    std::vector<Constant*> CVM1(16, CM1);
9550    std::vector<Constant*> CVM2(16, CM2);
9551    Constant *C = ConstantVector::get(CVM1);
9552    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
9553    SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
9554                            MachinePointerInfo::getConstantPool(),
9555                            false, false, 16);
9556
9557    // r = pblendv(r, psllw(r & (char16)15, 4), a);
9558    M = DAG.getNode(ISD::AND, dl, VT, R, M);
9559    M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9560                    DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
9561                    DAG.getConstant(4, MVT::i32));
9562    R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op);
9563    // a += a
9564    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
9565
9566    C = ConstantVector::get(CVM2);
9567    CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
9568    M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
9569                    MachinePointerInfo::getConstantPool(),
9570                    false, false, 16);
9571
9572    // r = pblendv(r, psllw(r & (char16)63, 2), a);
9573    M = DAG.getNode(ISD::AND, dl, VT, R, M);
9574    M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9575                    DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
9576                    DAG.getConstant(2, MVT::i32));
9577    R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op);
9578    // a += a
9579    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
9580
9581    // return pblendv(r, r+r, a);
9582    R = DAG.getNode(X86ISD::PBLENDVB, dl, VT,
9583                    R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op);
9584    return R;
9585  }
9586  return SDValue();
9587}
9588
9589SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
9590  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
9591  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
9592  // looks for this combo and may remove the "setcc" instruction if the "setcc"
9593  // has only one use.
9594  SDNode *N = Op.getNode();
9595  SDValue LHS = N->getOperand(0);
9596  SDValue RHS = N->getOperand(1);
9597  unsigned BaseOp = 0;
9598  unsigned Cond = 0;
9599  DebugLoc DL = Op.getDebugLoc();
9600  switch (Op.getOpcode()) {
9601  default: llvm_unreachable("Unknown ovf instruction!");
9602  case ISD::SADDO:
9603    // A subtract of one will be selected as a INC. Note that INC doesn't
9604    // set CF, so we can't do this for UADDO.
9605    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
9606      if (C->isOne()) {
9607        BaseOp = X86ISD::INC;
9608        Cond = X86::COND_O;
9609        break;
9610      }
9611    BaseOp = X86ISD::ADD;
9612    Cond = X86::COND_O;
9613    break;
9614  case ISD::UADDO:
9615    BaseOp = X86ISD::ADD;
9616    Cond = X86::COND_B;
9617    break;
9618  case ISD::SSUBO:
9619    // A subtract of one will be selected as a DEC. Note that DEC doesn't
9620    // set CF, so we can't do this for USUBO.
9621    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
9622      if (C->isOne()) {
9623        BaseOp = X86ISD::DEC;
9624        Cond = X86::COND_O;
9625        break;
9626      }
9627    BaseOp = X86ISD::SUB;
9628    Cond = X86::COND_O;
9629    break;
9630  case ISD::USUBO:
9631    BaseOp = X86ISD::SUB;
9632    Cond = X86::COND_B;
9633    break;
9634  case ISD::SMULO:
9635    BaseOp = X86ISD::SMUL;
9636    Cond = X86::COND_O;
9637    break;
9638  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
9639    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
9640                                 MVT::i32);
9641    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
9642
9643    SDValue SetCC =
9644      DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9645                  DAG.getConstant(X86::COND_O, MVT::i32),
9646                  SDValue(Sum.getNode(), 2));
9647
9648    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
9649  }
9650  }
9651
9652  // Also sets EFLAGS.
9653  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
9654  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
9655
9656  SDValue SetCC =
9657    DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
9658                DAG.getConstant(Cond, MVT::i32),
9659                SDValue(Sum.getNode(), 1));
9660
9661  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
9662}
9663
9664SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const{
9665  DebugLoc dl = Op.getDebugLoc();
9666  SDNode* Node = Op.getNode();
9667  EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
9668  EVT VT = Node->getValueType(0);
9669
9670  if (Subtarget->hasSSE2() && VT.isVector()) {
9671    unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
9672                        ExtraVT.getScalarType().getSizeInBits();
9673    SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
9674
9675    unsigned SHLIntrinsicsID = 0;
9676    unsigned SRAIntrinsicsID = 0;
9677    switch (VT.getSimpleVT().SimpleTy) {
9678      default:
9679        return SDValue();
9680      case MVT::v2i64: {
9681        SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_q;
9682        SRAIntrinsicsID = 0;
9683        break;
9684      }
9685      case MVT::v4i32: {
9686        SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d;
9687        SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d;
9688        break;
9689      }
9690      case MVT::v8i16: {
9691        SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_w;
9692        SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w;
9693        break;
9694      }
9695    }
9696
9697    SDValue Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9698                         DAG.getConstant(SHLIntrinsicsID, MVT::i32),
9699                         Node->getOperand(0), ShAmt);
9700
9701    // In case of 1 bit sext, no need to shr
9702    if (ExtraVT.getScalarType().getSizeInBits() == 1) return Tmp1;
9703
9704    if (SRAIntrinsicsID) {
9705      Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
9706                         DAG.getConstant(SRAIntrinsicsID, MVT::i32),
9707                         Tmp1, ShAmt);
9708    }
9709    return Tmp1;
9710  }
9711
9712  return SDValue();
9713}
9714
9715
9716SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
9717  DebugLoc dl = Op.getDebugLoc();
9718
9719  // Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
9720  // There isn't any reason to disable it if the target processor supports it.
9721  if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) {
9722    SDValue Chain = Op.getOperand(0);
9723    SDValue Zero = DAG.getConstant(0, MVT::i32);
9724    SDValue Ops[] = {
9725      DAG.getRegister(X86::ESP, MVT::i32), // Base
9726      DAG.getTargetConstant(1, MVT::i8),   // Scale
9727      DAG.getRegister(0, MVT::i32),        // Index
9728      DAG.getTargetConstant(0, MVT::i32),  // Disp
9729      DAG.getRegister(0, MVT::i32),        // Segment.
9730      Zero,
9731      Chain
9732    };
9733    SDNode *Res =
9734      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
9735                          array_lengthof(Ops));
9736    return SDValue(Res, 0);
9737  }
9738
9739  unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
9740  if (!isDev)
9741    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
9742
9743  unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
9744  unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
9745  unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
9746  unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
9747
9748  // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
9749  if (!Op1 && !Op2 && !Op3 && Op4)
9750    return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0));
9751
9752  // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
9753  if (Op1 && !Op2 && !Op3 && !Op4)
9754    return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0));
9755
9756  // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)),
9757  //           (MFENCE)>;
9758  return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
9759}
9760
9761SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op,
9762                                             SelectionDAG &DAG) const {
9763  DebugLoc dl = Op.getDebugLoc();
9764  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
9765    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
9766  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
9767    cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
9768
9769  // The only fence that needs an instruction is a sequentially-consistent
9770  // cross-thread fence.
9771  if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
9772    // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
9773    // no-sse2). There isn't any reason to disable it if the target processor
9774    // supports it.
9775    if (Subtarget->hasSSE2() || Subtarget->is64Bit())
9776      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
9777
9778    SDValue Chain = Op.getOperand(0);
9779    SDValue Zero = DAG.getConstant(0, MVT::i32);
9780    SDValue Ops[] = {
9781      DAG.getRegister(X86::ESP, MVT::i32), // Base
9782      DAG.getTargetConstant(1, MVT::i8),   // Scale
9783      DAG.getRegister(0, MVT::i32),        // Index
9784      DAG.getTargetConstant(0, MVT::i32),  // Disp
9785      DAG.getRegister(0, MVT::i32),        // Segment.
9786      Zero,
9787      Chain
9788    };
9789    SDNode *Res =
9790      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
9791                         array_lengthof(Ops));
9792    return SDValue(Res, 0);
9793  }
9794
9795  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
9796  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
9797}
9798
9799
9800SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
9801  EVT T = Op.getValueType();
9802  DebugLoc DL = Op.getDebugLoc();
9803  unsigned Reg = 0;
9804  unsigned size = 0;
9805  switch(T.getSimpleVT().SimpleTy) {
9806  default:
9807    assert(false && "Invalid value type!");
9808  case MVT::i8:  Reg = X86::AL;  size = 1; break;
9809  case MVT::i16: Reg = X86::AX;  size = 2; break;
9810  case MVT::i32: Reg = X86::EAX; size = 4; break;
9811  case MVT::i64:
9812    assert(Subtarget->is64Bit() && "Node not type legal!");
9813    Reg = X86::RAX; size = 8;
9814    break;
9815  }
9816  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
9817                                    Op.getOperand(2), SDValue());
9818  SDValue Ops[] = { cpIn.getValue(0),
9819                    Op.getOperand(1),
9820                    Op.getOperand(3),
9821                    DAG.getTargetConstant(size, MVT::i8),
9822                    cpIn.getValue(1) };
9823  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
9824  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
9825  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
9826                                           Ops, 5, T, MMO);
9827  SDValue cpOut =
9828    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
9829  return cpOut;
9830}
9831
9832SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
9833                                                 SelectionDAG &DAG) const {
9834  assert(Subtarget->is64Bit() && "Result not type legalized?");
9835  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
9836  SDValue TheChain = Op.getOperand(0);
9837  DebugLoc dl = Op.getDebugLoc();
9838  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
9839  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
9840  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
9841                                   rax.getValue(2));
9842  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
9843                            DAG.getConstant(32, MVT::i8));
9844  SDValue Ops[] = {
9845    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
9846    rdx.getValue(1)
9847  };
9848  return DAG.getMergeValues(Ops, 2, dl);
9849}
9850
9851SDValue X86TargetLowering::LowerBITCAST(SDValue Op,
9852                                            SelectionDAG &DAG) const {
9853  EVT SrcVT = Op.getOperand(0).getValueType();
9854  EVT DstVT = Op.getValueType();
9855  assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
9856         Subtarget->hasMMX() && "Unexpected custom BITCAST");
9857  assert((DstVT == MVT::i64 ||
9858          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
9859         "Unexpected custom BITCAST");
9860  // i64 <=> MMX conversions are Legal.
9861  if (SrcVT==MVT::i64 && DstVT.isVector())
9862    return Op;
9863  if (DstVT==MVT::i64 && SrcVT.isVector())
9864    return Op;
9865  // MMX <=> MMX conversions are Legal.
9866  if (SrcVT.isVector() && DstVT.isVector())
9867    return Op;
9868  // All other conversions need to be expanded.
9869  return SDValue();
9870}
9871
9872SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
9873  SDNode *Node = Op.getNode();
9874  DebugLoc dl = Node->getDebugLoc();
9875  EVT T = Node->getValueType(0);
9876  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
9877                              DAG.getConstant(0, T), Node->getOperand(2));
9878  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
9879                       cast<AtomicSDNode>(Node)->getMemoryVT(),
9880                       Node->getOperand(0),
9881                       Node->getOperand(1), negOp,
9882                       cast<AtomicSDNode>(Node)->getSrcValue(),
9883                       cast<AtomicSDNode>(Node)->getAlignment(),
9884                       cast<AtomicSDNode>(Node)->getOrdering(),
9885                       cast<AtomicSDNode>(Node)->getSynchScope());
9886}
9887
9888static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
9889  EVT VT = Op.getNode()->getValueType(0);
9890
9891  // Let legalize expand this if it isn't a legal type yet.
9892  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
9893    return SDValue();
9894
9895  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
9896
9897  unsigned Opc;
9898  bool ExtraOp = false;
9899  switch (Op.getOpcode()) {
9900  default: assert(0 && "Invalid code");
9901  case ISD::ADDC: Opc = X86ISD::ADD; break;
9902  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
9903  case ISD::SUBC: Opc = X86ISD::SUB; break;
9904  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
9905  }
9906
9907  if (!ExtraOp)
9908    return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
9909                       Op.getOperand(1));
9910  return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
9911                     Op.getOperand(1), Op.getOperand(2));
9912}
9913
9914/// LowerOperation - Provide custom lowering hooks for some operations.
9915///
9916SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
9917  switch (Op.getOpcode()) {
9918  default: llvm_unreachable("Should not custom lower this!");
9919  case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
9920  case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op,DAG);
9921  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op,DAG);
9922  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
9923  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
9924  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
9925  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
9926  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
9927  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
9928  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
9929  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op, DAG);
9930  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, DAG);
9931  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
9932  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
9933  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
9934  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
9935  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
9936  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
9937  case ISD::SHL_PARTS:
9938  case ISD::SRA_PARTS:
9939  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
9940  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
9941  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
9942  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
9943  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
9944  case ISD::FABS:               return LowerFABS(Op, DAG);
9945  case ISD::FNEG:               return LowerFNEG(Op, DAG);
9946  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
9947  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
9948  case ISD::SETCC:              return LowerSETCC(Op, DAG);
9949  case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
9950  case ISD::SELECT:             return LowerSELECT(Op, DAG);
9951  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
9952  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
9953  case ISD::VASTART:            return LowerVASTART(Op, DAG);
9954  case ISD::VAARG:              return LowerVAARG(Op, DAG);
9955  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
9956  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
9957  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
9958  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
9959  case ISD::FRAME_TO_ARGS_OFFSET:
9960                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
9961  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
9962  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
9963  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
9964  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
9965  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
9966  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
9967  case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
9968  case ISD::SRA:
9969  case ISD::SRL:
9970  case ISD::SHL:                return LowerShift(Op, DAG);
9971  case ISD::SADDO:
9972  case ISD::UADDO:
9973  case ISD::SSUBO:
9974  case ISD::USUBO:
9975  case ISD::SMULO:
9976  case ISD::UMULO:              return LowerXALUO(Op, DAG);
9977  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
9978  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
9979  case ISD::ADDC:
9980  case ISD::ADDE:
9981  case ISD::SUBC:
9982  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
9983  }
9984}
9985
9986void X86TargetLowering::
9987ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
9988                        SelectionDAG &DAG, unsigned NewOp) const {
9989  EVT T = Node->getValueType(0);
9990  DebugLoc dl = Node->getDebugLoc();
9991  assert (T == MVT::i64 && "Only know how to expand i64 atomics");
9992
9993  SDValue Chain = Node->getOperand(0);
9994  SDValue In1 = Node->getOperand(1);
9995  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
9996                             Node->getOperand(2), DAG.getIntPtrConstant(0));
9997  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
9998                             Node->getOperand(2), DAG.getIntPtrConstant(1));
9999  SDValue Ops[] = { Chain, In1, In2L, In2H };
10000  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
10001  SDValue Result =
10002    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
10003                            cast<MemSDNode>(Node)->getMemOperand());
10004  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
10005  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
10006  Results.push_back(Result.getValue(2));
10007}
10008
10009/// ReplaceNodeResults - Replace a node with an illegal result type
10010/// with a new node built out of custom code.
10011void X86TargetLowering::ReplaceNodeResults(SDNode *N,
10012                                           SmallVectorImpl<SDValue>&Results,
10013                                           SelectionDAG &DAG) const {
10014  DebugLoc dl = N->getDebugLoc();
10015  switch (N->getOpcode()) {
10016  default:
10017    assert(false && "Do not know how to custom type legalize this operation!");
10018    return;
10019  case ISD::SIGN_EXTEND_INREG:
10020  case ISD::ADDC:
10021  case ISD::ADDE:
10022  case ISD::SUBC:
10023  case ISD::SUBE:
10024    // We don't want to expand or promote these.
10025    return;
10026  case ISD::FP_TO_SINT: {
10027    std::pair<SDValue,SDValue> Vals =
10028        FP_TO_INTHelper(SDValue(N, 0), DAG, true);
10029    SDValue FIST = Vals.first, StackSlot = Vals.second;
10030    if (FIST.getNode() != 0) {
10031      EVT VT = N->getValueType(0);
10032      // Return a load from the stack slot.
10033      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
10034                                    MachinePointerInfo(), false, false, 0));
10035    }
10036    return;
10037  }
10038  case ISD::READCYCLECOUNTER: {
10039    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
10040    SDValue TheChain = N->getOperand(0);
10041    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
10042    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
10043                                     rd.getValue(1));
10044    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
10045                                     eax.getValue(2));
10046    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
10047    SDValue Ops[] = { eax, edx };
10048    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
10049    Results.push_back(edx.getValue(1));
10050    return;
10051  }
10052  case ISD::ATOMIC_CMP_SWAP: {
10053    EVT T = N->getValueType(0);
10054    assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
10055    SDValue cpInL, cpInH;
10056    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
10057                        DAG.getConstant(0, MVT::i32));
10058    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
10059                        DAG.getConstant(1, MVT::i32));
10060    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
10061    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
10062                             cpInL.getValue(1));
10063    SDValue swapInL, swapInH;
10064    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
10065                          DAG.getConstant(0, MVT::i32));
10066    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
10067                          DAG.getConstant(1, MVT::i32));
10068    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
10069                               cpInH.getValue(1));
10070    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
10071                               swapInL.getValue(1));
10072    SDValue Ops[] = { swapInH.getValue(0),
10073                      N->getOperand(1),
10074                      swapInH.getValue(1) };
10075    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
10076    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
10077    SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys,
10078                                             Ops, 3, T, MMO);
10079    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
10080                                        MVT::i32, Result.getValue(1));
10081    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
10082                                        MVT::i32, cpOutL.getValue(2));
10083    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
10084    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
10085    Results.push_back(cpOutH.getValue(1));
10086    return;
10087  }
10088  case ISD::ATOMIC_LOAD_ADD:
10089    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
10090    return;
10091  case ISD::ATOMIC_LOAD_AND:
10092    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
10093    return;
10094  case ISD::ATOMIC_LOAD_NAND:
10095    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
10096    return;
10097  case ISD::ATOMIC_LOAD_OR:
10098    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
10099    return;
10100  case ISD::ATOMIC_LOAD_SUB:
10101    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
10102    return;
10103  case ISD::ATOMIC_LOAD_XOR:
10104    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
10105    return;
10106  case ISD::ATOMIC_SWAP:
10107    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
10108    return;
10109  }
10110}
10111
10112const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
10113  switch (Opcode) {
10114  default: return NULL;
10115  case X86ISD::BSF:                return "X86ISD::BSF";
10116  case X86ISD::BSR:                return "X86ISD::BSR";
10117  case X86ISD::SHLD:               return "X86ISD::SHLD";
10118  case X86ISD::SHRD:               return "X86ISD::SHRD";
10119  case X86ISD::FAND:               return "X86ISD::FAND";
10120  case X86ISD::FOR:                return "X86ISD::FOR";
10121  case X86ISD::FXOR:               return "X86ISD::FXOR";
10122  case X86ISD::FSRL:               return "X86ISD::FSRL";
10123  case X86ISD::FILD:               return "X86ISD::FILD";
10124  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
10125  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
10126  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
10127  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
10128  case X86ISD::FLD:                return "X86ISD::FLD";
10129  case X86ISD::FST:                return "X86ISD::FST";
10130  case X86ISD::CALL:               return "X86ISD::CALL";
10131  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
10132  case X86ISD::BT:                 return "X86ISD::BT";
10133  case X86ISD::CMP:                return "X86ISD::CMP";
10134  case X86ISD::COMI:               return "X86ISD::COMI";
10135  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
10136  case X86ISD::SETCC:              return "X86ISD::SETCC";
10137  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
10138  case X86ISD::FSETCCsd:           return "X86ISD::FSETCCsd";
10139  case X86ISD::FSETCCss:           return "X86ISD::FSETCCss";
10140  case X86ISD::CMOV:               return "X86ISD::CMOV";
10141  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
10142  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
10143  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
10144  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
10145  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
10146  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
10147  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
10148  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
10149  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
10150  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
10151  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
10152  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
10153  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
10154  case X86ISD::ANDNP:              return "X86ISD::ANDNP";
10155  case X86ISD::PSIGNB:             return "X86ISD::PSIGNB";
10156  case X86ISD::PSIGNW:             return "X86ISD::PSIGNW";
10157  case X86ISD::PSIGND:             return "X86ISD::PSIGND";
10158  case X86ISD::PBLENDVB:           return "X86ISD::PBLENDVB";
10159  case X86ISD::FMAX:               return "X86ISD::FMAX";
10160  case X86ISD::FMIN:               return "X86ISD::FMIN";
10161  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
10162  case X86ISD::FRCP:               return "X86ISD::FRCP";
10163  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
10164  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
10165  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
10166  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
10167  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
10168  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
10169  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
10170  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
10171  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
10172  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
10173  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
10174  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
10175  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
10176  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
10177  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
10178  case X86ISD::VSHL:               return "X86ISD::VSHL";
10179  case X86ISD::VSRL:               return "X86ISD::VSRL";
10180  case X86ISD::CMPPD:              return "X86ISD::CMPPD";
10181  case X86ISD::CMPPS:              return "X86ISD::CMPPS";
10182  case X86ISD::PCMPEQB:            return "X86ISD::PCMPEQB";
10183  case X86ISD::PCMPEQW:            return "X86ISD::PCMPEQW";
10184  case X86ISD::PCMPEQD:            return "X86ISD::PCMPEQD";
10185  case X86ISD::PCMPEQQ:            return "X86ISD::PCMPEQQ";
10186  case X86ISD::PCMPGTB:            return "X86ISD::PCMPGTB";
10187  case X86ISD::PCMPGTW:            return "X86ISD::PCMPGTW";
10188  case X86ISD::PCMPGTD:            return "X86ISD::PCMPGTD";
10189  case X86ISD::PCMPGTQ:            return "X86ISD::PCMPGTQ";
10190  case X86ISD::ADD:                return "X86ISD::ADD";
10191  case X86ISD::SUB:                return "X86ISD::SUB";
10192  case X86ISD::ADC:                return "X86ISD::ADC";
10193  case X86ISD::SBB:                return "X86ISD::SBB";
10194  case X86ISD::SMUL:               return "X86ISD::SMUL";
10195  case X86ISD::UMUL:               return "X86ISD::UMUL";
10196  case X86ISD::INC:                return "X86ISD::INC";
10197  case X86ISD::DEC:                return "X86ISD::DEC";
10198  case X86ISD::OR:                 return "X86ISD::OR";
10199  case X86ISD::XOR:                return "X86ISD::XOR";
10200  case X86ISD::AND:                return "X86ISD::AND";
10201  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
10202  case X86ISD::PTEST:              return "X86ISD::PTEST";
10203  case X86ISD::TESTP:              return "X86ISD::TESTP";
10204  case X86ISD::PALIGN:             return "X86ISD::PALIGN";
10205  case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
10206  case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
10207  case X86ISD::PSHUFHW_LD:         return "X86ISD::PSHUFHW_LD";
10208  case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
10209  case X86ISD::PSHUFLW_LD:         return "X86ISD::PSHUFLW_LD";
10210  case X86ISD::SHUFPS:             return "X86ISD::SHUFPS";
10211  case X86ISD::SHUFPD:             return "X86ISD::SHUFPD";
10212  case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
10213  case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
10214  case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
10215  case X86ISD::MOVHLPD:            return "X86ISD::MOVHLPD";
10216  case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
10217  case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
10218  case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
10219  case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
10220  case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
10221  case X86ISD::MOVSHDUP_LD:        return "X86ISD::MOVSHDUP_LD";
10222  case X86ISD::MOVSLDUP_LD:        return "X86ISD::MOVSLDUP_LD";
10223  case X86ISD::MOVSD:              return "X86ISD::MOVSD";
10224  case X86ISD::MOVSS:              return "X86ISD::MOVSS";
10225  case X86ISD::UNPCKLPS:           return "X86ISD::UNPCKLPS";
10226  case X86ISD::UNPCKLPD:           return "X86ISD::UNPCKLPD";
10227  case X86ISD::VUNPCKLPDY:         return "X86ISD::VUNPCKLPDY";
10228  case X86ISD::UNPCKHPS:           return "X86ISD::UNPCKHPS";
10229  case X86ISD::UNPCKHPD:           return "X86ISD::UNPCKHPD";
10230  case X86ISD::PUNPCKLBW:          return "X86ISD::PUNPCKLBW";
10231  case X86ISD::PUNPCKLWD:          return "X86ISD::PUNPCKLWD";
10232  case X86ISD::PUNPCKLDQ:          return "X86ISD::PUNPCKLDQ";
10233  case X86ISD::PUNPCKLQDQ:         return "X86ISD::PUNPCKLQDQ";
10234  case X86ISD::PUNPCKHBW:          return "X86ISD::PUNPCKHBW";
10235  case X86ISD::PUNPCKHWD:          return "X86ISD::PUNPCKHWD";
10236  case X86ISD::PUNPCKHDQ:          return "X86ISD::PUNPCKHDQ";
10237  case X86ISD::PUNPCKHQDQ:         return "X86ISD::PUNPCKHQDQ";
10238  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
10239  case X86ISD::VPERMILPS:          return "X86ISD::VPERMILPS";
10240  case X86ISD::VPERMILPSY:         return "X86ISD::VPERMILPSY";
10241  case X86ISD::VPERMILPD:          return "X86ISD::VPERMILPD";
10242  case X86ISD::VPERMILPDY:         return "X86ISD::VPERMILPDY";
10243  case X86ISD::VPERM2F128:         return "X86ISD::VPERM2F128";
10244  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
10245  case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
10246  case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
10247  case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
10248  }
10249}
10250
10251// isLegalAddressingMode - Return true if the addressing mode represented
10252// by AM is legal for this target, for a load/store of the specified type.
10253bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
10254                                              Type *Ty) const {
10255  // X86 supports extremely general addressing modes.
10256  CodeModel::Model M = getTargetMachine().getCodeModel();
10257  Reloc::Model R = getTargetMachine().getRelocationModel();
10258
10259  // X86 allows a sign-extended 32-bit immediate field as a displacement.
10260  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
10261    return false;
10262
10263  if (AM.BaseGV) {
10264    unsigned GVFlags =
10265      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
10266
10267    // If a reference to this global requires an extra load, we can't fold it.
10268    if (isGlobalStubReference(GVFlags))
10269      return false;
10270
10271    // If BaseGV requires a register for the PIC base, we cannot also have a
10272    // BaseReg specified.
10273    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
10274      return false;
10275
10276    // If lower 4G is not available, then we must use rip-relative addressing.
10277    if ((M != CodeModel::Small || R != Reloc::Static) &&
10278        Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
10279      return false;
10280  }
10281
10282  switch (AM.Scale) {
10283  case 0:
10284  case 1:
10285  case 2:
10286  case 4:
10287  case 8:
10288    // These scales always work.
10289    break;
10290  case 3:
10291  case 5:
10292  case 9:
10293    // These scales are formed with basereg+scalereg.  Only accept if there is
10294    // no basereg yet.
10295    if (AM.HasBaseReg)
10296      return false;
10297    break;
10298  default:  // Other stuff never works.
10299    return false;
10300  }
10301
10302  return true;
10303}
10304
10305
10306bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
10307  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
10308    return false;
10309  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
10310  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
10311  if (NumBits1 <= NumBits2)
10312    return false;
10313  return true;
10314}
10315
10316bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
10317  if (!VT1.isInteger() || !VT2.isInteger())
10318    return false;
10319  unsigned NumBits1 = VT1.getSizeInBits();
10320  unsigned NumBits2 = VT2.getSizeInBits();
10321  if (NumBits1 <= NumBits2)
10322    return false;
10323  return true;
10324}
10325
10326bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
10327  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
10328  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
10329}
10330
10331bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
10332  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
10333  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
10334}
10335
10336bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
10337  // i16 instructions are longer (0x66 prefix) and potentially slower.
10338  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
10339}
10340
10341/// isShuffleMaskLegal - Targets can use this to indicate that they only
10342/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
10343/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
10344/// are assumed to be legal.
10345bool
10346X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
10347                                      EVT VT) const {
10348  // Very little shuffling can be done for 64-bit vectors right now.
10349  if (VT.getSizeInBits() == 64)
10350    return isPALIGNRMask(M, VT, Subtarget->hasSSSE3());
10351
10352  // FIXME: pshufb, blends, shifts.
10353  return (VT.getVectorNumElements() == 2 ||
10354          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
10355          isMOVLMask(M, VT) ||
10356          isSHUFPMask(M, VT) ||
10357          isPSHUFDMask(M, VT) ||
10358          isPSHUFHWMask(M, VT) ||
10359          isPSHUFLWMask(M, VT) ||
10360          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
10361          isUNPCKLMask(M, VT) ||
10362          isUNPCKHMask(M, VT) ||
10363          isUNPCKL_v_undef_Mask(M, VT) ||
10364          isUNPCKH_v_undef_Mask(M, VT));
10365}
10366
10367bool
10368X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
10369                                          EVT VT) const {
10370  unsigned NumElts = VT.getVectorNumElements();
10371  // FIXME: This collection of masks seems suspect.
10372  if (NumElts == 2)
10373    return true;
10374  if (NumElts == 4 && VT.getSizeInBits() == 128) {
10375    return (isMOVLMask(Mask, VT)  ||
10376            isCommutedMOVLMask(Mask, VT, true) ||
10377            isSHUFPMask(Mask, VT) ||
10378            isCommutedSHUFPMask(Mask, VT));
10379  }
10380  return false;
10381}
10382
10383//===----------------------------------------------------------------------===//
10384//                           X86 Scheduler Hooks
10385//===----------------------------------------------------------------------===//
10386
10387// private utility function
10388MachineBasicBlock *
10389X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
10390                                                       MachineBasicBlock *MBB,
10391                                                       unsigned regOpc,
10392                                                       unsigned immOpc,
10393                                                       unsigned LoadOpc,
10394                                                       unsigned CXchgOpc,
10395                                                       unsigned notOpc,
10396                                                       unsigned EAXreg,
10397                                                       TargetRegisterClass *RC,
10398                                                       bool invSrc) const {
10399  // For the atomic bitwise operator, we generate
10400  //   thisMBB:
10401  //   newMBB:
10402  //     ld  t1 = [bitinstr.addr]
10403  //     op  t2 = t1, [bitinstr.val]
10404  //     mov EAX = t1
10405  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
10406  //     bz  newMBB
10407  //     fallthrough -->nextMBB
10408  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
10409  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
10410  MachineFunction::iterator MBBIter = MBB;
10411  ++MBBIter;
10412
10413  /// First build the CFG
10414  MachineFunction *F = MBB->getParent();
10415  MachineBasicBlock *thisMBB = MBB;
10416  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
10417  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
10418  F->insert(MBBIter, newMBB);
10419  F->insert(MBBIter, nextMBB);
10420
10421  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
10422  nextMBB->splice(nextMBB->begin(), thisMBB,
10423                  llvm::next(MachineBasicBlock::iterator(bInstr)),
10424                  thisMBB->end());
10425  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
10426
10427  // Update thisMBB to fall through to newMBB
10428  thisMBB->addSuccessor(newMBB);
10429
10430  // newMBB jumps to itself and fall through to nextMBB
10431  newMBB->addSuccessor(nextMBB);
10432  newMBB->addSuccessor(newMBB);
10433
10434  // Insert instructions into newMBB based on incoming instruction
10435  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
10436         "unexpected number of operands");
10437  DebugLoc dl = bInstr->getDebugLoc();
10438  MachineOperand& destOper = bInstr->getOperand(0);
10439  MachineOperand* argOpers[2 + X86::AddrNumOperands];
10440  int numArgs = bInstr->getNumOperands() - 1;
10441  for (int i=0; i < numArgs; ++i)
10442    argOpers[i] = &bInstr->getOperand(i+1);
10443
10444  // x86 address has 4 operands: base, index, scale, and displacement
10445  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
10446  int valArgIndx = lastAddrIndx + 1;
10447
10448  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
10449  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
10450  for (int i=0; i <= lastAddrIndx; ++i)
10451    (*MIB).addOperand(*argOpers[i]);
10452
10453  unsigned tt = F->getRegInfo().createVirtualRegister(RC);
10454  if (invSrc) {
10455    MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
10456  }
10457  else
10458    tt = t1;
10459
10460  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
10461  assert((argOpers[valArgIndx]->isReg() ||
10462          argOpers[valArgIndx]->isImm()) &&
10463         "invalid operand");
10464  if (argOpers[valArgIndx]->isReg())
10465    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
10466  else
10467    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
10468  MIB.addReg(tt);
10469  (*MIB).addOperand(*argOpers[valArgIndx]);
10470
10471  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
10472  MIB.addReg(t1);
10473
10474  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
10475  for (int i=0; i <= lastAddrIndx; ++i)
10476    (*MIB).addOperand(*argOpers[i]);
10477  MIB.addReg(t2);
10478  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
10479  (*MIB).setMemRefs(bInstr->memoperands_begin(),
10480                    bInstr->memoperands_end());
10481
10482  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
10483  MIB.addReg(EAXreg);
10484
10485  // insert branch
10486  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
10487
10488  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
10489  return nextMBB;
10490}
10491
10492// private utility function:  64 bit atomics on 32 bit host.
10493MachineBasicBlock *
10494X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
10495                                                       MachineBasicBlock *MBB,
10496                                                       unsigned regOpcL,
10497                                                       unsigned regOpcH,
10498                                                       unsigned immOpcL,
10499                                                       unsigned immOpcH,
10500                                                       bool invSrc) const {
10501  // For the atomic bitwise operator, we generate
10502  //   thisMBB (instructions are in pairs, except cmpxchg8b)
10503  //     ld t1,t2 = [bitinstr.addr]
10504  //   newMBB:
10505  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
10506  //     op  t5, t6 <- out1, out2, [bitinstr.val]
10507  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
10508  //     mov ECX, EBX <- t5, t6
10509  //     mov EAX, EDX <- t1, t2
10510  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
10511  //     mov t3, t4 <- EAX, EDX
10512  //     bz  newMBB
10513  //     result in out1, out2
10514  //     fallthrough -->nextMBB
10515
10516  const TargetRegisterClass *RC = X86::GR32RegisterClass;
10517  const unsigned LoadOpc = X86::MOV32rm;
10518  const unsigned NotOpc = X86::NOT32r;
10519  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
10520  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
10521  MachineFunction::iterator MBBIter = MBB;
10522  ++MBBIter;
10523
10524  /// First build the CFG
10525  MachineFunction *F = MBB->getParent();
10526  MachineBasicBlock *thisMBB = MBB;
10527  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
10528  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
10529  F->insert(MBBIter, newMBB);
10530  F->insert(MBBIter, nextMBB);
10531
10532  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
10533  nextMBB->splice(nextMBB->begin(), thisMBB,
10534                  llvm::next(MachineBasicBlock::iterator(bInstr)),
10535                  thisMBB->end());
10536  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
10537
10538  // Update thisMBB to fall through to newMBB
10539  thisMBB->addSuccessor(newMBB);
10540
10541  // newMBB jumps to itself and fall through to nextMBB
10542  newMBB->addSuccessor(nextMBB);
10543  newMBB->addSuccessor(newMBB);
10544
10545  DebugLoc dl = bInstr->getDebugLoc();
10546  // Insert instructions into newMBB based on incoming instruction
10547  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
10548  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
10549         "unexpected number of operands");
10550  MachineOperand& dest1Oper = bInstr->getOperand(0);
10551  MachineOperand& dest2Oper = bInstr->getOperand(1);
10552  MachineOperand* argOpers[2 + X86::AddrNumOperands];
10553  for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
10554    argOpers[i] = &bInstr->getOperand(i+2);
10555
10556    // We use some of the operands multiple times, so conservatively just
10557    // clear any kill flags that might be present.
10558    if (argOpers[i]->isReg() && argOpers[i]->isUse())
10559      argOpers[i]->setIsKill(false);
10560  }
10561
10562  // x86 address has 5 operands: base, index, scale, displacement, and segment.
10563  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
10564
10565  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
10566  MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
10567  for (int i=0; i <= lastAddrIndx; ++i)
10568    (*MIB).addOperand(*argOpers[i]);
10569  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
10570  MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
10571  // add 4 to displacement.
10572  for (int i=0; i <= lastAddrIndx-2; ++i)
10573    (*MIB).addOperand(*argOpers[i]);
10574  MachineOperand newOp3 = *(argOpers[3]);
10575  if (newOp3.isImm())
10576    newOp3.setImm(newOp3.getImm()+4);
10577  else
10578    newOp3.setOffset(newOp3.getOffset()+4);
10579  (*MIB).addOperand(newOp3);
10580  (*MIB).addOperand(*argOpers[lastAddrIndx]);
10581
10582  // t3/4 are defined later, at the bottom of the loop
10583  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
10584  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
10585  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
10586    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
10587  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
10588    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
10589
10590  // The subsequent operations should be using the destination registers of
10591  //the PHI instructions.
10592  if (invSrc) {
10593    t1 = F->getRegInfo().createVirtualRegister(RC);
10594    t2 = F->getRegInfo().createVirtualRegister(RC);
10595    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
10596    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
10597  } else {
10598    t1 = dest1Oper.getReg();
10599    t2 = dest2Oper.getReg();
10600  }
10601
10602  int valArgIndx = lastAddrIndx + 1;
10603  assert((argOpers[valArgIndx]->isReg() ||
10604          argOpers[valArgIndx]->isImm()) &&
10605         "invalid operand");
10606  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
10607  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
10608  if (argOpers[valArgIndx]->isReg())
10609    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
10610  else
10611    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
10612  if (regOpcL != X86::MOV32rr)
10613    MIB.addReg(t1);
10614  (*MIB).addOperand(*argOpers[valArgIndx]);
10615  assert(argOpers[valArgIndx + 1]->isReg() ==
10616         argOpers[valArgIndx]->isReg());
10617  assert(argOpers[valArgIndx + 1]->isImm() ==
10618         argOpers[valArgIndx]->isImm());
10619  if (argOpers[valArgIndx + 1]->isReg())
10620    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
10621  else
10622    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
10623  if (regOpcH != X86::MOV32rr)
10624    MIB.addReg(t2);
10625  (*MIB).addOperand(*argOpers[valArgIndx + 1]);
10626
10627  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
10628  MIB.addReg(t1);
10629  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
10630  MIB.addReg(t2);
10631
10632  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
10633  MIB.addReg(t5);
10634  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
10635  MIB.addReg(t6);
10636
10637  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
10638  for (int i=0; i <= lastAddrIndx; ++i)
10639    (*MIB).addOperand(*argOpers[i]);
10640
10641  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
10642  (*MIB).setMemRefs(bInstr->memoperands_begin(),
10643                    bInstr->memoperands_end());
10644
10645  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
10646  MIB.addReg(X86::EAX);
10647  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
10648  MIB.addReg(X86::EDX);
10649
10650  // insert branch
10651  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
10652
10653  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
10654  return nextMBB;
10655}
10656
10657// private utility function
10658MachineBasicBlock *
10659X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
10660                                                      MachineBasicBlock *MBB,
10661                                                      unsigned cmovOpc) const {
10662  // For the atomic min/max operator, we generate
10663  //   thisMBB:
10664  //   newMBB:
10665  //     ld t1 = [min/max.addr]
10666  //     mov t2 = [min/max.val]
10667  //     cmp  t1, t2
10668  //     cmov[cond] t2 = t1
10669  //     mov EAX = t1
10670  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
10671  //     bz   newMBB
10672  //     fallthrough -->nextMBB
10673  //
10674  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
10675  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
10676  MachineFunction::iterator MBBIter = MBB;
10677  ++MBBIter;
10678
10679  /// First build the CFG
10680  MachineFunction *F = MBB->getParent();
10681  MachineBasicBlock *thisMBB = MBB;
10682  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
10683  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
10684  F->insert(MBBIter, newMBB);
10685  F->insert(MBBIter, nextMBB);
10686
10687  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
10688  nextMBB->splice(nextMBB->begin(), thisMBB,
10689                  llvm::next(MachineBasicBlock::iterator(mInstr)),
10690                  thisMBB->end());
10691  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
10692
10693  // Update thisMBB to fall through to newMBB
10694  thisMBB->addSuccessor(newMBB);
10695
10696  // newMBB jumps to newMBB and fall through to nextMBB
10697  newMBB->addSuccessor(nextMBB);
10698  newMBB->addSuccessor(newMBB);
10699
10700  DebugLoc dl = mInstr->getDebugLoc();
10701  // Insert instructions into newMBB based on incoming instruction
10702  assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
10703         "unexpected number of operands");
10704  MachineOperand& destOper = mInstr->getOperand(0);
10705  MachineOperand* argOpers[2 + X86::AddrNumOperands];
10706  int numArgs = mInstr->getNumOperands() - 1;
10707  for (int i=0; i < numArgs; ++i)
10708    argOpers[i] = &mInstr->getOperand(i+1);
10709
10710  // x86 address has 4 operands: base, index, scale, and displacement
10711  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
10712  int valArgIndx = lastAddrIndx + 1;
10713
10714  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
10715  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
10716  for (int i=0; i <= lastAddrIndx; ++i)
10717    (*MIB).addOperand(*argOpers[i]);
10718
10719  // We only support register and immediate values
10720  assert((argOpers[valArgIndx]->isReg() ||
10721          argOpers[valArgIndx]->isImm()) &&
10722         "invalid operand");
10723
10724  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
10725  if (argOpers[valArgIndx]->isReg())
10726    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
10727  else
10728    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
10729  (*MIB).addOperand(*argOpers[valArgIndx]);
10730
10731  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
10732  MIB.addReg(t1);
10733
10734  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
10735  MIB.addReg(t1);
10736  MIB.addReg(t2);
10737
10738  // Generate movc
10739  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
10740  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
10741  MIB.addReg(t2);
10742  MIB.addReg(t1);
10743
10744  // Cmp and exchange if none has modified the memory location
10745  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
10746  for (int i=0; i <= lastAddrIndx; ++i)
10747    (*MIB).addOperand(*argOpers[i]);
10748  MIB.addReg(t3);
10749  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
10750  (*MIB).setMemRefs(mInstr->memoperands_begin(),
10751                    mInstr->memoperands_end());
10752
10753  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
10754  MIB.addReg(X86::EAX);
10755
10756  // insert branch
10757  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
10758
10759  mInstr->eraseFromParent();   // The pseudo instruction is gone now.
10760  return nextMBB;
10761}
10762
10763// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
10764// or XMM0_V32I8 in AVX all of this code can be replaced with that
10765// in the .td file.
10766MachineBasicBlock *
10767X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
10768                            unsigned numArgs, bool memArg) const {
10769  assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) &&
10770         "Target must have SSE4.2 or AVX features enabled");
10771
10772  DebugLoc dl = MI->getDebugLoc();
10773  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
10774  unsigned Opc;
10775  if (!Subtarget->hasAVX()) {
10776    if (memArg)
10777      Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
10778    else
10779      Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
10780  } else {
10781    if (memArg)
10782      Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm;
10783    else
10784      Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
10785  }
10786
10787  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
10788  for (unsigned i = 0; i < numArgs; ++i) {
10789    MachineOperand &Op = MI->getOperand(i+1);
10790    if (!(Op.isReg() && Op.isImplicit()))
10791      MIB.addOperand(Op);
10792  }
10793  BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
10794    .addReg(X86::XMM0);
10795
10796  MI->eraseFromParent();
10797  return BB;
10798}
10799
10800MachineBasicBlock *
10801X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const {
10802  DebugLoc dl = MI->getDebugLoc();
10803  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
10804
10805  // Address into RAX/EAX, other two args into ECX, EDX.
10806  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
10807  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
10808  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
10809  for (int i = 0; i < X86::AddrNumOperands; ++i)
10810    MIB.addOperand(MI->getOperand(i));
10811
10812  unsigned ValOps = X86::AddrNumOperands;
10813  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
10814    .addReg(MI->getOperand(ValOps).getReg());
10815  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
10816    .addReg(MI->getOperand(ValOps+1).getReg());
10817
10818  // The instruction doesn't actually take any operands though.
10819  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
10820
10821  MI->eraseFromParent(); // The pseudo is gone now.
10822  return BB;
10823}
10824
10825MachineBasicBlock *
10826X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const {
10827  DebugLoc dl = MI->getDebugLoc();
10828  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
10829
10830  // First arg in ECX, the second in EAX.
10831  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
10832    .addReg(MI->getOperand(0).getReg());
10833  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
10834    .addReg(MI->getOperand(1).getReg());
10835
10836  // The instruction doesn't actually take any operands though.
10837  BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr));
10838
10839  MI->eraseFromParent(); // The pseudo is gone now.
10840  return BB;
10841}
10842
10843MachineBasicBlock *
10844X86TargetLowering::EmitVAARG64WithCustomInserter(
10845                   MachineInstr *MI,
10846                   MachineBasicBlock *MBB) const {
10847  // Emit va_arg instruction on X86-64.
10848
10849  // Operands to this pseudo-instruction:
10850  // 0  ) Output        : destination address (reg)
10851  // 1-5) Input         : va_list address (addr, i64mem)
10852  // 6  ) ArgSize       : Size (in bytes) of vararg type
10853  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
10854  // 8  ) Align         : Alignment of type
10855  // 9  ) EFLAGS (implicit-def)
10856
10857  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
10858  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
10859
10860  unsigned DestReg = MI->getOperand(0).getReg();
10861  MachineOperand &Base = MI->getOperand(1);
10862  MachineOperand &Scale = MI->getOperand(2);
10863  MachineOperand &Index = MI->getOperand(3);
10864  MachineOperand &Disp = MI->getOperand(4);
10865  MachineOperand &Segment = MI->getOperand(5);
10866  unsigned ArgSize = MI->getOperand(6).getImm();
10867  unsigned ArgMode = MI->getOperand(7).getImm();
10868  unsigned Align = MI->getOperand(8).getImm();
10869
10870  // Memory Reference
10871  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
10872  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
10873  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
10874
10875  // Machine Information
10876  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
10877  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
10878  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
10879  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
10880  DebugLoc DL = MI->getDebugLoc();
10881
10882  // struct va_list {
10883  //   i32   gp_offset
10884  //   i32   fp_offset
10885  //   i64   overflow_area (address)
10886  //   i64   reg_save_area (address)
10887  // }
10888  // sizeof(va_list) = 24
10889  // alignment(va_list) = 8
10890
10891  unsigned TotalNumIntRegs = 6;
10892  unsigned TotalNumXMMRegs = 8;
10893  bool UseGPOffset = (ArgMode == 1);
10894  bool UseFPOffset = (ArgMode == 2);
10895  unsigned MaxOffset = TotalNumIntRegs * 8 +
10896                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
10897
10898  /* Align ArgSize to a multiple of 8 */
10899  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
10900  bool NeedsAlign = (Align > 8);
10901
10902  MachineBasicBlock *thisMBB = MBB;
10903  MachineBasicBlock *overflowMBB;
10904  MachineBasicBlock *offsetMBB;
10905  MachineBasicBlock *endMBB;
10906
10907  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
10908  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
10909  unsigned OffsetReg = 0;
10910
10911  if (!UseGPOffset && !UseFPOffset) {
10912    // If we only pull from the overflow region, we don't create a branch.
10913    // We don't need to alter control flow.
10914    OffsetDestReg = 0; // unused
10915    OverflowDestReg = DestReg;
10916
10917    offsetMBB = NULL;
10918    overflowMBB = thisMBB;
10919    endMBB = thisMBB;
10920  } else {
10921    // First emit code to check if gp_offset (or fp_offset) is below the bound.
10922    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
10923    // If not, pull from overflow_area. (branch to overflowMBB)
10924    //
10925    //       thisMBB
10926    //         |     .
10927    //         |        .
10928    //     offsetMBB   overflowMBB
10929    //         |        .
10930    //         |     .
10931    //        endMBB
10932
10933    // Registers for the PHI in endMBB
10934    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
10935    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
10936
10937    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
10938    MachineFunction *MF = MBB->getParent();
10939    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
10940    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
10941    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
10942
10943    MachineFunction::iterator MBBIter = MBB;
10944    ++MBBIter;
10945
10946    // Insert the new basic blocks
10947    MF->insert(MBBIter, offsetMBB);
10948    MF->insert(MBBIter, overflowMBB);
10949    MF->insert(MBBIter, endMBB);
10950
10951    // Transfer the remainder of MBB and its successor edges to endMBB.
10952    endMBB->splice(endMBB->begin(), thisMBB,
10953                    llvm::next(MachineBasicBlock::iterator(MI)),
10954                    thisMBB->end());
10955    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
10956
10957    // Make offsetMBB and overflowMBB successors of thisMBB
10958    thisMBB->addSuccessor(offsetMBB);
10959    thisMBB->addSuccessor(overflowMBB);
10960
10961    // endMBB is a successor of both offsetMBB and overflowMBB
10962    offsetMBB->addSuccessor(endMBB);
10963    overflowMBB->addSuccessor(endMBB);
10964
10965    // Load the offset value into a register
10966    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
10967    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
10968      .addOperand(Base)
10969      .addOperand(Scale)
10970      .addOperand(Index)
10971      .addDisp(Disp, UseFPOffset ? 4 : 0)
10972      .addOperand(Segment)
10973      .setMemRefs(MMOBegin, MMOEnd);
10974
10975    // Check if there is enough room left to pull this argument.
10976    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
10977      .addReg(OffsetReg)
10978      .addImm(MaxOffset + 8 - ArgSizeA8);
10979
10980    // Branch to "overflowMBB" if offset >= max
10981    // Fall through to "offsetMBB" otherwise
10982    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
10983      .addMBB(overflowMBB);
10984  }
10985
10986  // In offsetMBB, emit code to use the reg_save_area.
10987  if (offsetMBB) {
10988    assert(OffsetReg != 0);
10989
10990    // Read the reg_save_area address.
10991    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
10992    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
10993      .addOperand(Base)
10994      .addOperand(Scale)
10995      .addOperand(Index)
10996      .addDisp(Disp, 16)
10997      .addOperand(Segment)
10998      .setMemRefs(MMOBegin, MMOEnd);
10999
11000    // Zero-extend the offset
11001    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
11002      BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
11003        .addImm(0)
11004        .addReg(OffsetReg)
11005        .addImm(X86::sub_32bit);
11006
11007    // Add the offset to the reg_save_area to get the final address.
11008    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
11009      .addReg(OffsetReg64)
11010      .addReg(RegSaveReg);
11011
11012    // Compute the offset for the next argument
11013    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
11014    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
11015      .addReg(OffsetReg)
11016      .addImm(UseFPOffset ? 16 : 8);
11017
11018    // Store it back into the va_list.
11019    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
11020      .addOperand(Base)
11021      .addOperand(Scale)
11022      .addOperand(Index)
11023      .addDisp(Disp, UseFPOffset ? 4 : 0)
11024      .addOperand(Segment)
11025      .addReg(NextOffsetReg)
11026      .setMemRefs(MMOBegin, MMOEnd);
11027
11028    // Jump to endMBB
11029    BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
11030      .addMBB(endMBB);
11031  }
11032
11033  //
11034  // Emit code to use overflow area
11035  //
11036
11037  // Load the overflow_area address into a register.
11038  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
11039  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
11040    .addOperand(Base)
11041    .addOperand(Scale)
11042    .addOperand(Index)
11043    .addDisp(Disp, 8)
11044    .addOperand(Segment)
11045    .setMemRefs(MMOBegin, MMOEnd);
11046
11047  // If we need to align it, do so. Otherwise, just copy the address
11048  // to OverflowDestReg.
11049  if (NeedsAlign) {
11050    // Align the overflow address
11051    assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
11052    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
11053
11054    // aligned_addr = (addr + (align-1)) & ~(align-1)
11055    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
11056      .addReg(OverflowAddrReg)
11057      .addImm(Align-1);
11058
11059    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
11060      .addReg(TmpReg)
11061      .addImm(~(uint64_t)(Align-1));
11062  } else {
11063    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
11064      .addReg(OverflowAddrReg);
11065  }
11066
11067  // Compute the next overflow address after this argument.
11068  // (the overflow address should be kept 8-byte aligned)
11069  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
11070  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
11071    .addReg(OverflowDestReg)
11072    .addImm(ArgSizeA8);
11073
11074  // Store the new overflow address.
11075  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
11076    .addOperand(Base)
11077    .addOperand(Scale)
11078    .addOperand(Index)
11079    .addDisp(Disp, 8)
11080    .addOperand(Segment)
11081    .addReg(NextAddrReg)
11082    .setMemRefs(MMOBegin, MMOEnd);
11083
11084  // If we branched, emit the PHI to the front of endMBB.
11085  if (offsetMBB) {
11086    BuildMI(*endMBB, endMBB->begin(), DL,
11087            TII->get(X86::PHI), DestReg)
11088      .addReg(OffsetDestReg).addMBB(offsetMBB)
11089      .addReg(OverflowDestReg).addMBB(overflowMBB);
11090  }
11091
11092  // Erase the pseudo instruction
11093  MI->eraseFromParent();
11094
11095  return endMBB;
11096}
11097
11098MachineBasicBlock *
11099X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
11100                                                 MachineInstr *MI,
11101                                                 MachineBasicBlock *MBB) const {
11102  // Emit code to save XMM registers to the stack. The ABI says that the
11103  // number of registers to save is given in %al, so it's theoretically
11104  // possible to do an indirect jump trick to avoid saving all of them,
11105  // however this code takes a simpler approach and just executes all
11106  // of the stores if %al is non-zero. It's less code, and it's probably
11107  // easier on the hardware branch predictor, and stores aren't all that
11108  // expensive anyway.
11109
11110  // Create the new basic blocks. One block contains all the XMM stores,
11111  // and one block is the final destination regardless of whether any
11112  // stores were performed.
11113  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
11114  MachineFunction *F = MBB->getParent();
11115  MachineFunction::iterator MBBIter = MBB;
11116  ++MBBIter;
11117  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
11118  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
11119  F->insert(MBBIter, XMMSaveMBB);
11120  F->insert(MBBIter, EndMBB);
11121
11122  // Transfer the remainder of MBB and its successor edges to EndMBB.
11123  EndMBB->splice(EndMBB->begin(), MBB,
11124                 llvm::next(MachineBasicBlock::iterator(MI)),
11125                 MBB->end());
11126  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
11127
11128  // The original block will now fall through to the XMM save block.
11129  MBB->addSuccessor(XMMSaveMBB);
11130  // The XMMSaveMBB will fall through to the end block.
11131  XMMSaveMBB->addSuccessor(EndMBB);
11132
11133  // Now add the instructions.
11134  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
11135  DebugLoc DL = MI->getDebugLoc();
11136
11137  unsigned CountReg = MI->getOperand(0).getReg();
11138  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
11139  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
11140
11141  if (!Subtarget->isTargetWin64()) {
11142    // If %al is 0, branch around the XMM save block.
11143    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
11144    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
11145    MBB->addSuccessor(EndMBB);
11146  }
11147
11148  // In the XMM save block, save all the XMM argument registers.
11149  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
11150    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
11151    MachineMemOperand *MMO =
11152      F->getMachineMemOperand(
11153          MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
11154        MachineMemOperand::MOStore,
11155        /*Size=*/16, /*Align=*/16);
11156    BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
11157      .addFrameIndex(RegSaveFrameIndex)
11158      .addImm(/*Scale=*/1)
11159      .addReg(/*IndexReg=*/0)
11160      .addImm(/*Disp=*/Offset)
11161      .addReg(/*Segment=*/0)
11162      .addReg(MI->getOperand(i).getReg())
11163      .addMemOperand(MMO);
11164  }
11165
11166  MI->eraseFromParent();   // The pseudo instruction is gone now.
11167
11168  return EndMBB;
11169}
11170
11171MachineBasicBlock *
11172X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
11173                                     MachineBasicBlock *BB) const {
11174  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
11175  DebugLoc DL = MI->getDebugLoc();
11176
11177  // To "insert" a SELECT_CC instruction, we actually have to insert the
11178  // diamond control-flow pattern.  The incoming instruction knows the
11179  // destination vreg to set, the condition code register to branch on, the
11180  // true/false values to select between, and a branch opcode to use.
11181  const BasicBlock *LLVM_BB = BB->getBasicBlock();
11182  MachineFunction::iterator It = BB;
11183  ++It;
11184
11185  //  thisMBB:
11186  //  ...
11187  //   TrueVal = ...
11188  //   cmpTY ccX, r1, r2
11189  //   bCC copy1MBB
11190  //   fallthrough --> copy0MBB
11191  MachineBasicBlock *thisMBB = BB;
11192  MachineFunction *F = BB->getParent();
11193  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
11194  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
11195  F->insert(It, copy0MBB);
11196  F->insert(It, sinkMBB);
11197
11198  // If the EFLAGS register isn't dead in the terminator, then claim that it's
11199  // live into the sink and copy blocks.
11200  const MachineFunction *MF = BB->getParent();
11201  const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
11202  BitVector ReservedRegs = TRI->getReservedRegs(*MF);
11203
11204  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
11205    const MachineOperand &MO = MI->getOperand(I);
11206    if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue;
11207    unsigned Reg = MO.getReg();
11208    if (Reg != X86::EFLAGS) continue;
11209    copy0MBB->addLiveIn(Reg);
11210    sinkMBB->addLiveIn(Reg);
11211  }
11212
11213  // Transfer the remainder of BB and its successor edges to sinkMBB.
11214  sinkMBB->splice(sinkMBB->begin(), BB,
11215                  llvm::next(MachineBasicBlock::iterator(MI)),
11216                  BB->end());
11217  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
11218
11219  // Add the true and fallthrough blocks as its successors.
11220  BB->addSuccessor(copy0MBB);
11221  BB->addSuccessor(sinkMBB);
11222
11223  // Create the conditional branch instruction.
11224  unsigned Opc =
11225    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
11226  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
11227
11228  //  copy0MBB:
11229  //   %FalseValue = ...
11230  //   # fallthrough to sinkMBB
11231  copy0MBB->addSuccessor(sinkMBB);
11232
11233  //  sinkMBB:
11234  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
11235  //  ...
11236  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
11237          TII->get(X86::PHI), MI->getOperand(0).getReg())
11238    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
11239    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
11240
11241  MI->eraseFromParent();   // The pseudo instruction is gone now.
11242  return sinkMBB;
11243}
11244
11245MachineBasicBlock *
11246X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
11247                                          MachineBasicBlock *BB) const {
11248  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
11249  DebugLoc DL = MI->getDebugLoc();
11250
11251  assert(!Subtarget->isTargetEnvMacho());
11252
11253  // The lowering is pretty easy: we're just emitting the call to _alloca.  The
11254  // non-trivial part is impdef of ESP.
11255
11256  if (Subtarget->isTargetWin64()) {
11257    if (Subtarget->isTargetCygMing()) {
11258      // ___chkstk(Mingw64):
11259      // Clobbers R10, R11, RAX and EFLAGS.
11260      // Updates RSP.
11261      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
11262        .addExternalSymbol("___chkstk")
11263        .addReg(X86::RAX, RegState::Implicit)
11264        .addReg(X86::RSP, RegState::Implicit)
11265        .addReg(X86::RAX, RegState::Define | RegState::Implicit)
11266        .addReg(X86::RSP, RegState::Define | RegState::Implicit)
11267        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
11268    } else {
11269      // __chkstk(MSVCRT): does not update stack pointer.
11270      // Clobbers R10, R11 and EFLAGS.
11271      // FIXME: RAX(allocated size) might be reused and not killed.
11272      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
11273        .addExternalSymbol("__chkstk")
11274        .addReg(X86::RAX, RegState::Implicit)
11275        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
11276      // RAX has the offset to subtracted from RSP.
11277      BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
11278        .addReg(X86::RSP)
11279        .addReg(X86::RAX);
11280    }
11281  } else {
11282    const char *StackProbeSymbol =
11283      Subtarget->isTargetWindows() ? "_chkstk" : "_alloca";
11284
11285    BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
11286      .addExternalSymbol(StackProbeSymbol)
11287      .addReg(X86::EAX, RegState::Implicit)
11288      .addReg(X86::ESP, RegState::Implicit)
11289      .addReg(X86::EAX, RegState::Define | RegState::Implicit)
11290      .addReg(X86::ESP, RegState::Define | RegState::Implicit)
11291      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
11292  }
11293
11294  MI->eraseFromParent();   // The pseudo instruction is gone now.
11295  return BB;
11296}
11297
11298MachineBasicBlock *
11299X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
11300                                      MachineBasicBlock *BB) const {
11301  // This is pretty easy.  We're taking the value that we received from
11302  // our load from the relocation, sticking it in either RDI (x86-64)
11303  // or EAX and doing an indirect call.  The return value will then
11304  // be in the normal return register.
11305  const X86InstrInfo *TII
11306    = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
11307  DebugLoc DL = MI->getDebugLoc();
11308  MachineFunction *F = BB->getParent();
11309
11310  assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
11311  assert(MI->getOperand(3).isGlobal() && "This should be a global");
11312
11313  if (Subtarget->is64Bit()) {
11314    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
11315                                      TII->get(X86::MOV64rm), X86::RDI)
11316    .addReg(X86::RIP)
11317    .addImm(0).addReg(0)
11318    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
11319                      MI->getOperand(3).getTargetFlags())
11320    .addReg(0);
11321    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
11322    addDirectMem(MIB, X86::RDI);
11323  } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
11324    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
11325                                      TII->get(X86::MOV32rm), X86::EAX)
11326    .addReg(0)
11327    .addImm(0).addReg(0)
11328    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
11329                      MI->getOperand(3).getTargetFlags())
11330    .addReg(0);
11331    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
11332    addDirectMem(MIB, X86::EAX);
11333  } else {
11334    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
11335                                      TII->get(X86::MOV32rm), X86::EAX)
11336    .addReg(TII->getGlobalBaseReg(F))
11337    .addImm(0).addReg(0)
11338    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
11339                      MI->getOperand(3).getTargetFlags())
11340    .addReg(0);
11341    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
11342    addDirectMem(MIB, X86::EAX);
11343  }
11344
11345  MI->eraseFromParent(); // The pseudo instruction is gone now.
11346  return BB;
11347}
11348
11349MachineBasicBlock *
11350X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
11351                                               MachineBasicBlock *BB) const {
11352  switch (MI->getOpcode()) {
11353  default: assert(false && "Unexpected instr type to insert");
11354  case X86::TAILJMPd64:
11355  case X86::TAILJMPr64:
11356  case X86::TAILJMPm64:
11357    assert(!"TAILJMP64 would not be touched here.");
11358  case X86::TCRETURNdi64:
11359  case X86::TCRETURNri64:
11360  case X86::TCRETURNmi64:
11361    // Defs of TCRETURNxx64 has Win64's callee-saved registers, as subset.
11362    // On AMD64, additional defs should be added before register allocation.
11363    if (!Subtarget->isTargetWin64()) {
11364      MI->addRegisterDefined(X86::RSI);
11365      MI->addRegisterDefined(X86::RDI);
11366      MI->addRegisterDefined(X86::XMM6);
11367      MI->addRegisterDefined(X86::XMM7);
11368      MI->addRegisterDefined(X86::XMM8);
11369      MI->addRegisterDefined(X86::XMM9);
11370      MI->addRegisterDefined(X86::XMM10);
11371      MI->addRegisterDefined(X86::XMM11);
11372      MI->addRegisterDefined(X86::XMM12);
11373      MI->addRegisterDefined(X86::XMM13);
11374      MI->addRegisterDefined(X86::XMM14);
11375      MI->addRegisterDefined(X86::XMM15);
11376    }
11377    return BB;
11378  case X86::WIN_ALLOCA:
11379    return EmitLoweredWinAlloca(MI, BB);
11380  case X86::TLSCall_32:
11381  case X86::TLSCall_64:
11382    return EmitLoweredTLSCall(MI, BB);
11383  case X86::CMOV_GR8:
11384  case X86::CMOV_FR32:
11385  case X86::CMOV_FR64:
11386  case X86::CMOV_V4F32:
11387  case X86::CMOV_V2F64:
11388  case X86::CMOV_V2I64:
11389  case X86::CMOV_V8F32:
11390  case X86::CMOV_V4F64:
11391  case X86::CMOV_V4I64:
11392  case X86::CMOV_GR16:
11393  case X86::CMOV_GR32:
11394  case X86::CMOV_RFP32:
11395  case X86::CMOV_RFP64:
11396  case X86::CMOV_RFP80:
11397    return EmitLoweredSelect(MI, BB);
11398
11399  case X86::FP32_TO_INT16_IN_MEM:
11400  case X86::FP32_TO_INT32_IN_MEM:
11401  case X86::FP32_TO_INT64_IN_MEM:
11402  case X86::FP64_TO_INT16_IN_MEM:
11403  case X86::FP64_TO_INT32_IN_MEM:
11404  case X86::FP64_TO_INT64_IN_MEM:
11405  case X86::FP80_TO_INT16_IN_MEM:
11406  case X86::FP80_TO_INT32_IN_MEM:
11407  case X86::FP80_TO_INT64_IN_MEM: {
11408    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
11409    DebugLoc DL = MI->getDebugLoc();
11410
11411    // Change the floating point control register to use "round towards zero"
11412    // mode when truncating to an integer value.
11413    MachineFunction *F = BB->getParent();
11414    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
11415    addFrameReference(BuildMI(*BB, MI, DL,
11416                              TII->get(X86::FNSTCW16m)), CWFrameIdx);
11417
11418    // Load the old value of the high byte of the control word...
11419    unsigned OldCW =
11420      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
11421    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
11422                      CWFrameIdx);
11423
11424    // Set the high part to be round to zero...
11425    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
11426      .addImm(0xC7F);
11427
11428    // Reload the modified control word now...
11429    addFrameReference(BuildMI(*BB, MI, DL,
11430                              TII->get(X86::FLDCW16m)), CWFrameIdx);
11431
11432    // Restore the memory image of control word to original value
11433    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
11434      .addReg(OldCW);
11435
11436    // Get the X86 opcode to use.
11437    unsigned Opc;
11438    switch (MI->getOpcode()) {
11439    default: llvm_unreachable("illegal opcode!");
11440    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
11441    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
11442    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
11443    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
11444    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
11445    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
11446    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
11447    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
11448    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
11449    }
11450
11451    X86AddressMode AM;
11452    MachineOperand &Op = MI->getOperand(0);
11453    if (Op.isReg()) {
11454      AM.BaseType = X86AddressMode::RegBase;
11455      AM.Base.Reg = Op.getReg();
11456    } else {
11457      AM.BaseType = X86AddressMode::FrameIndexBase;
11458      AM.Base.FrameIndex = Op.getIndex();
11459    }
11460    Op = MI->getOperand(1);
11461    if (Op.isImm())
11462      AM.Scale = Op.getImm();
11463    Op = MI->getOperand(2);
11464    if (Op.isImm())
11465      AM.IndexReg = Op.getImm();
11466    Op = MI->getOperand(3);
11467    if (Op.isGlobal()) {
11468      AM.GV = Op.getGlobal();
11469    } else {
11470      AM.Disp = Op.getImm();
11471    }
11472    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
11473                      .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
11474
11475    // Reload the original control word now.
11476    addFrameReference(BuildMI(*BB, MI, DL,
11477                              TII->get(X86::FLDCW16m)), CWFrameIdx);
11478
11479    MI->eraseFromParent();   // The pseudo instruction is gone now.
11480    return BB;
11481  }
11482    // String/text processing lowering.
11483  case X86::PCMPISTRM128REG:
11484  case X86::VPCMPISTRM128REG:
11485    return EmitPCMP(MI, BB, 3, false /* in-mem */);
11486  case X86::PCMPISTRM128MEM:
11487  case X86::VPCMPISTRM128MEM:
11488    return EmitPCMP(MI, BB, 3, true /* in-mem */);
11489  case X86::PCMPESTRM128REG:
11490  case X86::VPCMPESTRM128REG:
11491    return EmitPCMP(MI, BB, 5, false /* in mem */);
11492  case X86::PCMPESTRM128MEM:
11493  case X86::VPCMPESTRM128MEM:
11494    return EmitPCMP(MI, BB, 5, true /* in mem */);
11495
11496    // Thread synchronization.
11497  case X86::MONITOR:
11498    return EmitMonitor(MI, BB);
11499  case X86::MWAIT:
11500    return EmitMwait(MI, BB);
11501
11502    // Atomic Lowering.
11503  case X86::ATOMAND32:
11504    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
11505                                               X86::AND32ri, X86::MOV32rm,
11506                                               X86::LCMPXCHG32,
11507                                               X86::NOT32r, X86::EAX,
11508                                               X86::GR32RegisterClass);
11509  case X86::ATOMOR32:
11510    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
11511                                               X86::OR32ri, X86::MOV32rm,
11512                                               X86::LCMPXCHG32,
11513                                               X86::NOT32r, X86::EAX,
11514                                               X86::GR32RegisterClass);
11515  case X86::ATOMXOR32:
11516    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
11517                                               X86::XOR32ri, X86::MOV32rm,
11518                                               X86::LCMPXCHG32,
11519                                               X86::NOT32r, X86::EAX,
11520                                               X86::GR32RegisterClass);
11521  case X86::ATOMNAND32:
11522    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
11523                                               X86::AND32ri, X86::MOV32rm,
11524                                               X86::LCMPXCHG32,
11525                                               X86::NOT32r, X86::EAX,
11526                                               X86::GR32RegisterClass, true);
11527  case X86::ATOMMIN32:
11528    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
11529  case X86::ATOMMAX32:
11530    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
11531  case X86::ATOMUMIN32:
11532    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
11533  case X86::ATOMUMAX32:
11534    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
11535
11536  case X86::ATOMAND16:
11537    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
11538                                               X86::AND16ri, X86::MOV16rm,
11539                                               X86::LCMPXCHG16,
11540                                               X86::NOT16r, X86::AX,
11541                                               X86::GR16RegisterClass);
11542  case X86::ATOMOR16:
11543    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
11544                                               X86::OR16ri, X86::MOV16rm,
11545                                               X86::LCMPXCHG16,
11546                                               X86::NOT16r, X86::AX,
11547                                               X86::GR16RegisterClass);
11548  case X86::ATOMXOR16:
11549    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
11550                                               X86::XOR16ri, X86::MOV16rm,
11551                                               X86::LCMPXCHG16,
11552                                               X86::NOT16r, X86::AX,
11553                                               X86::GR16RegisterClass);
11554  case X86::ATOMNAND16:
11555    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
11556                                               X86::AND16ri, X86::MOV16rm,
11557                                               X86::LCMPXCHG16,
11558                                               X86::NOT16r, X86::AX,
11559                                               X86::GR16RegisterClass, true);
11560  case X86::ATOMMIN16:
11561    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
11562  case X86::ATOMMAX16:
11563    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
11564  case X86::ATOMUMIN16:
11565    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
11566  case X86::ATOMUMAX16:
11567    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
11568
11569  case X86::ATOMAND8:
11570    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
11571                                               X86::AND8ri, X86::MOV8rm,
11572                                               X86::LCMPXCHG8,
11573                                               X86::NOT8r, X86::AL,
11574                                               X86::GR8RegisterClass);
11575  case X86::ATOMOR8:
11576    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
11577                                               X86::OR8ri, X86::MOV8rm,
11578                                               X86::LCMPXCHG8,
11579                                               X86::NOT8r, X86::AL,
11580                                               X86::GR8RegisterClass);
11581  case X86::ATOMXOR8:
11582    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
11583                                               X86::XOR8ri, X86::MOV8rm,
11584                                               X86::LCMPXCHG8,
11585                                               X86::NOT8r, X86::AL,
11586                                               X86::GR8RegisterClass);
11587  case X86::ATOMNAND8:
11588    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
11589                                               X86::AND8ri, X86::MOV8rm,
11590                                               X86::LCMPXCHG8,
11591                                               X86::NOT8r, X86::AL,
11592                                               X86::GR8RegisterClass, true);
11593  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
11594  // This group is for 64-bit host.
11595  case X86::ATOMAND64:
11596    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
11597                                               X86::AND64ri32, X86::MOV64rm,
11598                                               X86::LCMPXCHG64,
11599                                               X86::NOT64r, X86::RAX,
11600                                               X86::GR64RegisterClass);
11601  case X86::ATOMOR64:
11602    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
11603                                               X86::OR64ri32, X86::MOV64rm,
11604                                               X86::LCMPXCHG64,
11605                                               X86::NOT64r, X86::RAX,
11606                                               X86::GR64RegisterClass);
11607  case X86::ATOMXOR64:
11608    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
11609                                               X86::XOR64ri32, X86::MOV64rm,
11610                                               X86::LCMPXCHG64,
11611                                               X86::NOT64r, X86::RAX,
11612                                               X86::GR64RegisterClass);
11613  case X86::ATOMNAND64:
11614    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
11615                                               X86::AND64ri32, X86::MOV64rm,
11616                                               X86::LCMPXCHG64,
11617                                               X86::NOT64r, X86::RAX,
11618                                               X86::GR64RegisterClass, true);
11619  case X86::ATOMMIN64:
11620    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
11621  case X86::ATOMMAX64:
11622    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
11623  case X86::ATOMUMIN64:
11624    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
11625  case X86::ATOMUMAX64:
11626    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
11627
11628  // This group does 64-bit operations on a 32-bit host.
11629  case X86::ATOMAND6432:
11630    return EmitAtomicBit6432WithCustomInserter(MI, BB,
11631                                               X86::AND32rr, X86::AND32rr,
11632                                               X86::AND32ri, X86::AND32ri,
11633                                               false);
11634  case X86::ATOMOR6432:
11635    return EmitAtomicBit6432WithCustomInserter(MI, BB,
11636                                               X86::OR32rr, X86::OR32rr,
11637                                               X86::OR32ri, X86::OR32ri,
11638                                               false);
11639  case X86::ATOMXOR6432:
11640    return EmitAtomicBit6432WithCustomInserter(MI, BB,
11641                                               X86::XOR32rr, X86::XOR32rr,
11642                                               X86::XOR32ri, X86::XOR32ri,
11643                                               false);
11644  case X86::ATOMNAND6432:
11645    return EmitAtomicBit6432WithCustomInserter(MI, BB,
11646                                               X86::AND32rr, X86::AND32rr,
11647                                               X86::AND32ri, X86::AND32ri,
11648                                               true);
11649  case X86::ATOMADD6432:
11650    return EmitAtomicBit6432WithCustomInserter(MI, BB,
11651                                               X86::ADD32rr, X86::ADC32rr,
11652                                               X86::ADD32ri, X86::ADC32ri,
11653                                               false);
11654  case X86::ATOMSUB6432:
11655    return EmitAtomicBit6432WithCustomInserter(MI, BB,
11656                                               X86::SUB32rr, X86::SBB32rr,
11657                                               X86::SUB32ri, X86::SBB32ri,
11658                                               false);
11659  case X86::ATOMSWAP6432:
11660    return EmitAtomicBit6432WithCustomInserter(MI, BB,
11661                                               X86::MOV32rr, X86::MOV32rr,
11662                                               X86::MOV32ri, X86::MOV32ri,
11663                                               false);
11664  case X86::VASTART_SAVE_XMM_REGS:
11665    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
11666
11667  case X86::VAARG_64:
11668    return EmitVAARG64WithCustomInserter(MI, BB);
11669  }
11670}
11671
11672//===----------------------------------------------------------------------===//
11673//                           X86 Optimization Hooks
11674//===----------------------------------------------------------------------===//
11675
11676void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
11677                                                       const APInt &Mask,
11678                                                       APInt &KnownZero,
11679                                                       APInt &KnownOne,
11680                                                       const SelectionDAG &DAG,
11681                                                       unsigned Depth) const {
11682  unsigned Opc = Op.getOpcode();
11683  assert((Opc >= ISD::BUILTIN_OP_END ||
11684          Opc == ISD::INTRINSIC_WO_CHAIN ||
11685          Opc == ISD::INTRINSIC_W_CHAIN ||
11686          Opc == ISD::INTRINSIC_VOID) &&
11687         "Should use MaskedValueIsZero if you don't know whether Op"
11688         " is a target node!");
11689
11690  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);   // Don't know anything.
11691  switch (Opc) {
11692  default: break;
11693  case X86ISD::ADD:
11694  case X86ISD::SUB:
11695  case X86ISD::ADC:
11696  case X86ISD::SBB:
11697  case X86ISD::SMUL:
11698  case X86ISD::UMUL:
11699  case X86ISD::INC:
11700  case X86ISD::DEC:
11701  case X86ISD::OR:
11702  case X86ISD::XOR:
11703  case X86ISD::AND:
11704    // These nodes' second result is a boolean.
11705    if (Op.getResNo() == 0)
11706      break;
11707    // Fallthrough
11708  case X86ISD::SETCC:
11709    KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
11710                                       Mask.getBitWidth() - 1);
11711    break;
11712  }
11713}
11714
11715unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
11716                                                         unsigned Depth) const {
11717  // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
11718  if (Op.getOpcode() == X86ISD::SETCC_CARRY)
11719    return Op.getValueType().getScalarType().getSizeInBits();
11720
11721  // Fallback case.
11722  return 1;
11723}
11724
11725/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
11726/// node is a GlobalAddress + offset.
11727bool X86TargetLowering::isGAPlusOffset(SDNode *N,
11728                                       const GlobalValue* &GA,
11729                                       int64_t &Offset) const {
11730  if (N->getOpcode() == X86ISD::Wrapper) {
11731    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
11732      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
11733      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
11734      return true;
11735    }
11736  }
11737  return TargetLowering::isGAPlusOffset(N, GA, Offset);
11738}
11739
11740/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
11741/// same as extracting the high 128-bit part of 256-bit vector and then
11742/// inserting the result into the low part of a new 256-bit vector
11743static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
11744  EVT VT = SVOp->getValueType(0);
11745  int NumElems = VT.getVectorNumElements();
11746
11747  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
11748  for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j)
11749    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
11750        SVOp->getMaskElt(j) >= 0)
11751      return false;
11752
11753  return true;
11754}
11755
11756/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
11757/// same as extracting the low 128-bit part of 256-bit vector and then
11758/// inserting the result into the high part of a new 256-bit vector
11759static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
11760  EVT VT = SVOp->getValueType(0);
11761  int NumElems = VT.getVectorNumElements();
11762
11763  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
11764  for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j)
11765    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
11766        SVOp->getMaskElt(j) >= 0)
11767      return false;
11768
11769  return true;
11770}
11771
11772/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
11773static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
11774                                        TargetLowering::DAGCombinerInfo &DCI) {
11775  DebugLoc dl = N->getDebugLoc();
11776  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
11777  SDValue V1 = SVOp->getOperand(0);
11778  SDValue V2 = SVOp->getOperand(1);
11779  EVT VT = SVOp->getValueType(0);
11780  int NumElems = VT.getVectorNumElements();
11781
11782  if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
11783      V2.getOpcode() == ISD::CONCAT_VECTORS) {
11784    //
11785    //                   0,0,0,...
11786    //                      |
11787    //    V      UNDEF    BUILD_VECTOR    UNDEF
11788    //     \      /           \           /
11789    //  CONCAT_VECTOR         CONCAT_VECTOR
11790    //         \                  /
11791    //          \                /
11792    //          RESULT: V + zero extended
11793    //
11794    if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
11795        V2.getOperand(1).getOpcode() != ISD::UNDEF ||
11796        V1.getOperand(1).getOpcode() != ISD::UNDEF)
11797      return SDValue();
11798
11799    if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
11800      return SDValue();
11801
11802    // To match the shuffle mask, the first half of the mask should
11803    // be exactly the first vector, and all the rest a splat with the
11804    // first element of the second one.
11805    for (int i = 0; i < NumElems/2; ++i)
11806      if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
11807          !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
11808        return SDValue();
11809
11810    // Emit a zeroed vector and insert the desired subvector on its
11811    // first half.
11812    SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */, DAG, dl);
11813    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0),
11814                         DAG.getConstant(0, MVT::i32), DAG, dl);
11815    return DCI.CombineTo(N, InsV);
11816  }
11817
11818  //===--------------------------------------------------------------------===//
11819  // Combine some shuffles into subvector extracts and inserts:
11820  //
11821
11822  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
11823  if (isShuffleHigh128VectorInsertLow(SVOp)) {
11824    SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32),
11825                                    DAG, dl);
11826    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
11827                                      V, DAG.getConstant(0, MVT::i32), DAG, dl);
11828    return DCI.CombineTo(N, InsV);
11829  }
11830
11831  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
11832  if (isShuffleLow128VectorInsertHigh(SVOp)) {
11833    SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl);
11834    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
11835                             V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
11836    return DCI.CombineTo(N, InsV);
11837  }
11838
11839  return SDValue();
11840}
11841
11842/// PerformShuffleCombine - Performs several different shuffle combines.
11843static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
11844                                     TargetLowering::DAGCombinerInfo &DCI,
11845                                     const X86Subtarget *Subtarget) {
11846  DebugLoc dl = N->getDebugLoc();
11847  EVT VT = N->getValueType(0);
11848
11849  // Don't create instructions with illegal types after legalize types has run.
11850  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11851  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
11852    return SDValue();
11853
11854  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
11855  if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 &&
11856      N->getOpcode() == ISD::VECTOR_SHUFFLE)
11857    return PerformShuffleCombine256(N, DAG, DCI);
11858
11859  // Only handle 128 wide vector from here on.
11860  if (VT.getSizeInBits() != 128)
11861    return SDValue();
11862
11863  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
11864  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
11865  // consecutive, non-overlapping, and in the right order.
11866  SmallVector<SDValue, 16> Elts;
11867  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
11868    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
11869
11870  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
11871}
11872
11873/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
11874/// generation and convert it from being a bunch of shuffles and extracts
11875/// to a simple store and scalar loads to extract the elements.
11876static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
11877                                                const TargetLowering &TLI) {
11878  SDValue InputVector = N->getOperand(0);
11879
11880  // Only operate on vectors of 4 elements, where the alternative shuffling
11881  // gets to be more expensive.
11882  if (InputVector.getValueType() != MVT::v4i32)
11883    return SDValue();
11884
11885  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
11886  // single use which is a sign-extend or zero-extend, and all elements are
11887  // used.
11888  SmallVector<SDNode *, 4> Uses;
11889  unsigned ExtractedElements = 0;
11890  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
11891       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
11892    if (UI.getUse().getResNo() != InputVector.getResNo())
11893      return SDValue();
11894
11895    SDNode *Extract = *UI;
11896    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11897      return SDValue();
11898
11899    if (Extract->getValueType(0) != MVT::i32)
11900      return SDValue();
11901    if (!Extract->hasOneUse())
11902      return SDValue();
11903    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
11904        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
11905      return SDValue();
11906    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
11907      return SDValue();
11908
11909    // Record which element was extracted.
11910    ExtractedElements |=
11911      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
11912
11913    Uses.push_back(Extract);
11914  }
11915
11916  // If not all the elements were used, this may not be worthwhile.
11917  if (ExtractedElements != 15)
11918    return SDValue();
11919
11920  // Ok, we've now decided to do the transformation.
11921  DebugLoc dl = InputVector.getDebugLoc();
11922
11923  // Store the value to a temporary stack slot.
11924  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
11925  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
11926                            MachinePointerInfo(), false, false, 0);
11927
11928  // Replace each use (extract) with a load of the appropriate element.
11929  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
11930       UE = Uses.end(); UI != UE; ++UI) {
11931    SDNode *Extract = *UI;
11932
11933    // cOMpute the element's address.
11934    SDValue Idx = Extract->getOperand(1);
11935    unsigned EltSize =
11936        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
11937    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
11938    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
11939
11940    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
11941                                     StackPtr, OffsetVal);
11942
11943    // Load the scalar.
11944    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
11945                                     ScalarAddr, MachinePointerInfo(),
11946                                     false, false, 0);
11947
11948    // Replace the exact with the load.
11949    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
11950  }
11951
11952  // The replacement was made in place; don't return anything.
11953  return SDValue();
11954}
11955
/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
///
/// Operand 0 of N is the condition, operand 1 the value selected when it is
/// true and operand 2 the value selected when it is false.  Two families of
/// rewrites are attempted:
///   1. With SSE2, an f32/f64 select whose condition is a SETCC on the same
///      two values is turned into X86ISD::FMIN / X86ISD::FMAX when the
///      NaN and signed-zero behavior can be preserved (or UnsafeFPMath is
///      set).
///   2. A select between two integer constants of a legal type is turned
///      into zext(cond) combined with a shift, an add, or a small multiply
///      plus add.
/// Returns the replacement value, or a null SDValue when nothing applies.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
  DebugLoc DL = N->getDebugLoc();
  SDValue Cond = N->getOperand(0);
  // Get the LHS/RHS of the select.
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);

  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
  // instructions match the semantics of the common C idiom x<y?x:y but not
  // x<=y?x:y, because of how they handle negative zero (which can be
  // ignored in unsafe-math mode).
  if (Subtarget->hasSSE2() &&
      (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
      Cond.getOpcode() == ISD::SETCC) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    // Opcode stays 0 unless one of the cases below proves a min/max is safe.
    // Note that LHS/RHS are only ever swapped on paths that also set Opcode.
    unsigned Opcode = 0;
    // Check for x CC y ? x : y.
    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
      switch (CC) {
      default: break;
      case ISD::SETULT:
        // Converting this to a min would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
          if (!UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETOLE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly.
        if (!UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETULE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // FALL THROUGH (intentional) to form the FMIN with swapped operands.
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETOGE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly.
        if (!UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGT:
        // Converting this to a max would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
          if (!UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // FALL THROUGH (intentional) to form the FMAX with swapped operands.
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMAX;
        break;
      }
    // Check for x CC y ? y : x -- a min/max with reversed arms.
    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
      switch (CC) {
      default: break;
      case ISD::SETOGE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!UnsafeFPMath &&
            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGT:
        // Converting this to a min would handle NaNs incorrectly.
        if (!UnsafeFPMath &&
            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // FALL THROUGH (intentional) to form the FMIN with swapped operands.
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETULT:
        // Converting this to a max would handle NaNs incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETOLE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETULE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // FALL THROUGH (intentional) to form the FMAX with swapped operands.
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMAX;
        break;
      }
    }

    if (Opcode)
      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
  }

  // If this is a select between two integer constants, try to do some
  // optimizations.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
      // Don't do this for crazy integer types.
      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
        // so that TrueC (the true value) is larger than FalseC.
        bool NeedsCondInvert = false;

        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
            // Efficiently invertible.
            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
              isa<ConstantSDNode>(Cond.getOperand(1))))) {
          NeedsCondInvert = true;
          std::swap(TrueC, FalseC);
        }

        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
        if (FalseC->getAPIntValue() == 0 &&
            TrueC->getAPIntValue().isPowerOf2()) {
          if (NeedsCondInvert) // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);

          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
                             DAG.getConstant(ShAmt, MVT::i8));
        }

        // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
          if (NeedsCondInvert) // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                             FalseC->getValueType(0), Cond);
          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                             SDValue(FalseC, 0));
        }

        // Optimize cases that will turn into an LEA instruction.  This requires
        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
          // Difference between the two constants, truncated to 32 bits for
          // i32 selects so that wrap-around is accounted for.
          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

          bool isFastMultiplier = false;
          if (Diff < 10) {
            switch ((unsigned char)Diff) {
              default: break;
              case 1:  // result = add base, cond
              case 2:  // result = lea base(    , cond*2)
              case 3:  // result = lea base(cond, cond*2)
              case 4:  // result = lea base(    , cond*4)
              case 5:  // result = lea base(cond, cond*4)
              case 8:  // result = lea base(    , cond*8)
              case 9:  // result = lea base(cond, cond*8)
                isFastMultiplier = true;
                break;
            }
          }

          if (isFastMultiplier) {
            // Same difference as above, recomputed at the constants' own bit
            // width (this APInt shadows the uint64_t Diff).
            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
            if (NeedsCondInvert) // Invert the condition if needed.
              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(1, Cond.getValueType()));

            // Zero extend the condition if needed.
            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                               Cond);
            // Scale the condition by the difference.
            if (Diff != 1)
              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(Diff, Cond.getValueType()));

            // Add the base if non-zero.
            if (FalseC->getAPIntValue() != 0)
              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                                 SDValue(FalseC, 0));
            return Cond;
          }
        }
      }
  }

  return SDValue();
}
12203
12204/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
12205static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
12206                                  TargetLowering::DAGCombinerInfo &DCI) {
12207  DebugLoc DL = N->getDebugLoc();
12208
12209  // If the flag operand isn't dead, don't touch this CMOV.
12210  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
12211    return SDValue();
12212
12213  SDValue FalseOp = N->getOperand(0);
12214  SDValue TrueOp = N->getOperand(1);
12215  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
12216  SDValue Cond = N->getOperand(3);
12217  if (CC == X86::COND_E || CC == X86::COND_NE) {
12218    switch (Cond.getOpcode()) {
12219    default: break;
12220    case X86ISD::BSR:
12221    case X86ISD::BSF:
12222      // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
12223      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
12224        return (CC == X86::COND_E) ? FalseOp : TrueOp;
12225    }
12226  }
12227
12228  // If this is a select between two integer constants, try to do some
12229  // optimizations.  Note that the operands are ordered the opposite of SELECT
12230  // operands.
12231  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
12232    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
12233      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
12234      // larger than FalseC (the false value).
12235      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
12236        CC = X86::GetOppositeBranchCondition(CC);
12237        std::swap(TrueC, FalseC);
12238      }
12239
12240      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
12241      // This is efficient for any integer data type (including i8/i16) and
12242      // shift amount.
12243      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
12244        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
12245                           DAG.getConstant(CC, MVT::i8), Cond);
12246
12247        // Zero extend the condition if needed.
12248        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
12249
12250        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
12251        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
12252                           DAG.getConstant(ShAmt, MVT::i8));
12253        if (N->getNumValues() == 2)  // Dead flag value?
12254          return DCI.CombineTo(N, Cond, SDValue());
12255        return Cond;
12256      }
12257
12258      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
12259      // for any integer data type, including i8/i16.
12260      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
12261        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
12262                           DAG.getConstant(CC, MVT::i8), Cond);
12263
12264        // Zero extend the condition if needed.
12265        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
12266                           FalseC->getValueType(0), Cond);
12267        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
12268                           SDValue(FalseC, 0));
12269
12270        if (N->getNumValues() == 2)  // Dead flag value?
12271          return DCI.CombineTo(N, Cond, SDValue());
12272        return Cond;
12273      }
12274
12275      // Optimize cases that will turn into an LEA instruction.  This requires
12276      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
12277      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
12278        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
12279        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
12280
12281        bool isFastMultiplier = false;
12282        if (Diff < 10) {
12283          switch ((unsigned char)Diff) {
12284          default: break;
12285          case 1:  // result = add base, cond
12286          case 2:  // result = lea base(    , cond*2)
12287          case 3:  // result = lea base(cond, cond*2)
12288          case 4:  // result = lea base(    , cond*4)
12289          case 5:  // result = lea base(cond, cond*4)
12290          case 8:  // result = lea base(    , cond*8)
12291          case 9:  // result = lea base(cond, cond*8)
12292            isFastMultiplier = true;
12293            break;
12294          }
12295        }
12296
12297        if (isFastMultiplier) {
12298          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
12299          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
12300                             DAG.getConstant(CC, MVT::i8), Cond);
12301          // Zero extend the condition if needed.
12302          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
12303                             Cond);
12304          // Scale the condition by the difference.
12305          if (Diff != 1)
12306            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
12307                               DAG.getConstant(Diff, Cond.getValueType()));
12308
12309          // Add the base if non-zero.
12310          if (FalseC->getAPIntValue() != 0)
12311            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
12312                               SDValue(FalseC, 0));
12313          if (N->getNumValues() == 2)  // Dead flag value?
12314            return DCI.CombineTo(N, Cond, SDValue());
12315          return Cond;
12316        }
12317      }
12318    }
12319  }
12320  return SDValue();
12321}
12322
12323
12324/// PerformMulCombine - Optimize a single multiply with constant into two
12325/// in order to implement it with two cheaper instructions, e.g.
12326/// LEA + SHL, LEA + LEA.
12327static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
12328                                 TargetLowering::DAGCombinerInfo &DCI) {
12329  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
12330    return SDValue();
12331
12332  EVT VT = N->getValueType(0);
12333  if (VT != MVT::i64)
12334    return SDValue();
12335
12336  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
12337  if (!C)
12338    return SDValue();
12339  uint64_t MulAmt = C->getZExtValue();
12340  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
12341    return SDValue();
12342
12343  uint64_t MulAmt1 = 0;
12344  uint64_t MulAmt2 = 0;
12345  if ((MulAmt % 9) == 0) {
12346    MulAmt1 = 9;
12347    MulAmt2 = MulAmt / 9;
12348  } else if ((MulAmt % 5) == 0) {
12349    MulAmt1 = 5;
12350    MulAmt2 = MulAmt / 5;
12351  } else if ((MulAmt % 3) == 0) {
12352    MulAmt1 = 3;
12353    MulAmt2 = MulAmt / 3;
12354  }
12355  if (MulAmt2 &&
12356      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
12357    DebugLoc DL = N->getDebugLoc();
12358
12359    if (isPowerOf2_64(MulAmt2) &&
12360        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
12361      // If second multiplifer is pow2, issue it first. We want the multiply by
12362      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
12363      // is an add.
12364      std::swap(MulAmt1, MulAmt2);
12365
12366    SDValue NewMul;
12367    if (isPowerOf2_64(MulAmt1))
12368      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
12369                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
12370    else
12371      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
12372                           DAG.getConstant(MulAmt1, VT));
12373
12374    if (isPowerOf2_64(MulAmt2))
12375      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
12376                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
12377    else
12378      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
12379                           DAG.getConstant(MulAmt2, VT));
12380
12381    // Do not add new nodes to DAG combiner worklist.
12382    DCI.CombineTo(N, NewMul, false);
12383  }
12384  return SDValue();
12385}
12386
12387static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
12388  SDValue N0 = N->getOperand(0);
12389  SDValue N1 = N->getOperand(1);
12390  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
12391  EVT VT = N0.getValueType();
12392
12393  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
12394  // since the result of setcc_c is all zero's or all ones.
12395  if (N1C && N0.getOpcode() == ISD::AND &&
12396      N0.getOperand(1).getOpcode() == ISD::Constant) {
12397    SDValue N00 = N0.getOperand(0);
12398    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
12399        ((N00.getOpcode() == ISD::ANY_EXTEND ||
12400          N00.getOpcode() == ISD::ZERO_EXTEND) &&
12401         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
12402      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
12403      APInt ShAmt = N1C->getAPIntValue();
12404      Mask = Mask.shl(ShAmt);
12405      if (Mask != 0)
12406        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
12407                           N00, DAG.getConstant(Mask, VT));
12408    }
12409  }
12410
12411  return SDValue();
12412}
12413
12414/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
12415///                       when possible.
12416static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
12417                                   const X86Subtarget *Subtarget) {
12418  EVT VT = N->getValueType(0);
12419  if (!VT.isVector() && VT.isInteger() &&
12420      N->getOpcode() == ISD::SHL)
12421    return PerformSHLCombine(N, DAG);
12422
12423  // On X86 with SSE2 support, we can transform this to a vector shift if
12424  // all elements are shifted by the same amount.  We can't do this in legalize
12425  // because the a constant vector is typically transformed to a constant pool
12426  // so we have no knowledge of the shift amount.
12427  if (!(Subtarget->hasSSE2() || Subtarget->hasAVX()))
12428    return SDValue();
12429
12430  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
12431    return SDValue();
12432
12433  SDValue ShAmtOp = N->getOperand(1);
12434  EVT EltVT = VT.getVectorElementType();
12435  DebugLoc DL = N->getDebugLoc();
12436  SDValue BaseShAmt = SDValue();
12437  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
12438    unsigned NumElts = VT.getVectorNumElements();
12439    unsigned i = 0;
12440    for (; i != NumElts; ++i) {
12441      SDValue Arg = ShAmtOp.getOperand(i);
12442      if (Arg.getOpcode() == ISD::UNDEF) continue;
12443      BaseShAmt = Arg;
12444      break;
12445    }
12446    for (; i != NumElts; ++i) {
12447      SDValue Arg = ShAmtOp.getOperand(i);
12448      if (Arg.getOpcode() == ISD::UNDEF) continue;
12449      if (Arg != BaseShAmt) {
12450        return SDValue();
12451      }
12452    }
12453  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
12454             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
12455    SDValue InVec = ShAmtOp.getOperand(0);
12456    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
12457      unsigned NumElts = InVec.getValueType().getVectorNumElements();
12458      unsigned i = 0;
12459      for (; i != NumElts; ++i) {
12460        SDValue Arg = InVec.getOperand(i);
12461        if (Arg.getOpcode() == ISD::UNDEF) continue;
12462        BaseShAmt = Arg;
12463        break;
12464      }
12465    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
12466       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
12467         unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
12468         if (C->getZExtValue() == SplatIdx)
12469           BaseShAmt = InVec.getOperand(1);
12470       }
12471    }
12472    if (BaseShAmt.getNode() == 0)
12473      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
12474                              DAG.getIntPtrConstant(0));
12475  } else
12476    return SDValue();
12477
12478  // The shift amount is an i32.
12479  if (EltVT.bitsGT(MVT::i32))
12480    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
12481  else if (EltVT.bitsLT(MVT::i32))
12482    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
12483
12484  // The shift amount is identical so we can do a vector shift.
12485  SDValue  ValOp = N->getOperand(0);
12486  switch (N->getOpcode()) {
12487  default:
12488    llvm_unreachable("Unknown shift opcode!");
12489    break;
12490  case ISD::SHL:
12491    if (VT == MVT::v2i64)
12492      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
12493                         DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
12494                         ValOp, BaseShAmt);
12495    if (VT == MVT::v4i32)
12496      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
12497                         DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
12498                         ValOp, BaseShAmt);
12499    if (VT == MVT::v8i16)
12500      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
12501                         DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
12502                         ValOp, BaseShAmt);
12503    break;
12504  case ISD::SRA:
12505    if (VT == MVT::v4i32)
12506      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
12507                         DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
12508                         ValOp, BaseShAmt);
12509    if (VT == MVT::v8i16)
12510      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
12511                         DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
12512                         ValOp, BaseShAmt);
12513    break;
12514  case ISD::SRL:
12515    if (VT == MVT::v2i64)
12516      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
12517                         DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
12518                         ValOp, BaseShAmt);
12519    if (VT == MVT::v4i32)
12520      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
12521                         DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
12522                         ValOp, BaseShAmt);
12523    if (VT ==  MVT::v8i16)
12524      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
12525                         DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
12526                         ValOp, BaseShAmt);
12527    break;
12528  }
12529  return SDValue();
12530}
12531
12532
12533// CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
12534// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
12535// and friends.  Likewise for OR -> CMPNEQSS.
12536static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
12537                            TargetLowering::DAGCombinerInfo &DCI,
12538                            const X86Subtarget *Subtarget) {
12539  unsigned opcode;
12540
12541  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
12542  // we're requiring SSE2 for both.
12543  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
12544    SDValue N0 = N->getOperand(0);
12545    SDValue N1 = N->getOperand(1);
12546    SDValue CMP0 = N0->getOperand(1);
12547    SDValue CMP1 = N1->getOperand(1);
12548    DebugLoc DL = N->getDebugLoc();
12549
12550    // The SETCCs should both refer to the same CMP.
12551    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
12552      return SDValue();
12553
12554    SDValue CMP00 = CMP0->getOperand(0);
12555    SDValue CMP01 = CMP0->getOperand(1);
12556    EVT     VT    = CMP00.getValueType();
12557
12558    if (VT == MVT::f32 || VT == MVT::f64) {
12559      bool ExpectingFlags = false;
12560      // Check for any users that want flags:
12561      for (SDNode::use_iterator UI = N->use_begin(),
12562             UE = N->use_end();
12563           !ExpectingFlags && UI != UE; ++UI)
12564        switch (UI->getOpcode()) {
12565        default:
12566        case ISD::BR_CC:
12567        case ISD::BRCOND:
12568        case ISD::SELECT:
12569          ExpectingFlags = true;
12570          break;
12571        case ISD::CopyToReg:
12572        case ISD::SIGN_EXTEND:
12573        case ISD::ZERO_EXTEND:
12574        case ISD::ANY_EXTEND:
12575          break;
12576        }
12577
12578      if (!ExpectingFlags) {
12579        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
12580        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
12581
12582        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
12583          X86::CondCode tmp = cc0;
12584          cc0 = cc1;
12585          cc1 = tmp;
12586        }
12587
12588        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
12589            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
12590          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
12591          X86ISD::NodeType NTOperator = is64BitFP ?
12592            X86ISD::FSETCCsd : X86ISD::FSETCCss;
12593          // FIXME: need symbolic constants for these magic numbers.
12594          // See X86ATTInstPrinter.cpp:printSSECC().
12595          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
12596          SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
12597                                              DAG.getConstant(x86cc, MVT::i8));
12598          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
12599                                              OnesOrZeroesF);
12600          SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
12601                                      DAG.getConstant(1, MVT::i32));
12602          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
12603          return OneBitOfTruth;
12604        }
12605      }
12606    }
12607  }
12608  return SDValue();
12609}
12610
12611/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
12612/// so it can be folded inside ANDNP.
12613static bool CanFoldXORWithAllOnes(const SDNode *N) {
12614  EVT VT = N->getValueType(0);
12615
12616  // Match direct AllOnes for 128 and 256-bit vectors
12617  if (ISD::isBuildVectorAllOnes(N))
12618    return true;
12619
12620  // Look through a bit convert.
12621  if (N->getOpcode() == ISD::BITCAST)
12622    N = N->getOperand(0).getNode();
12623
12624  // Sometimes the operand may come from a insert_subvector building a 256-bit
12625  // allones vector
12626  if (VT.getSizeInBits() == 256 &&
12627      N->getOpcode() == ISD::INSERT_SUBVECTOR) {
12628    SDValue V1 = N->getOperand(0);
12629    SDValue V2 = N->getOperand(1);
12630
12631    if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
12632        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
12633        ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
12634        ISD::isBuildVectorAllOnes(V2.getNode()))
12635      return true;
12636  }
12637
12638  return false;
12639}
12640
12641static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
12642                                 TargetLowering::DAGCombinerInfo &DCI,
12643                                 const X86Subtarget *Subtarget) {
12644  if (DCI.isBeforeLegalizeOps())
12645    return SDValue();
12646
12647  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
12648  if (R.getNode())
12649    return R;
12650
12651  // Want to form ANDNP nodes:
12652  // 1) In the hopes of then easily combining them with OR and AND nodes
12653  //    to form PBLEND/PSIGN.
12654  // 2) To match ANDN packed intrinsics
12655  EVT VT = N->getValueType(0);
12656  if (VT != MVT::v2i64 && VT != MVT::v4i64)
12657    return SDValue();
12658
12659  SDValue N0 = N->getOperand(0);
12660  SDValue N1 = N->getOperand(1);
12661  DebugLoc DL = N->getDebugLoc();
12662
12663  // Check LHS for vnot
12664  if (N0.getOpcode() == ISD::XOR &&
12665      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
12666      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
12667    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
12668
12669  // Check RHS for vnot
12670  if (N1.getOpcode() == ISD::XOR &&
12671      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
12672      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
12673    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
12674
12675  return SDValue();
12676}
12677
12678static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
12679                                TargetLowering::DAGCombinerInfo &DCI,
12680                                const X86Subtarget *Subtarget) {
12681  if (DCI.isBeforeLegalizeOps())
12682    return SDValue();
12683
12684  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
12685  if (R.getNode())
12686    return R;
12687
12688  EVT VT = N->getValueType(0);
12689  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64)
12690    return SDValue();
12691
12692  SDValue N0 = N->getOperand(0);
12693  SDValue N1 = N->getOperand(1);
12694
12695  // look for psign/blend
12696  if (Subtarget->hasSSSE3()) {
12697    if (VT == MVT::v2i64) {
12698      // Canonicalize pandn to RHS
12699      if (N0.getOpcode() == X86ISD::ANDNP)
12700        std::swap(N0, N1);
12701      // or (and (m, x), (pandn m, y))
12702      if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
12703        SDValue Mask = N1.getOperand(0);
12704        SDValue X    = N1.getOperand(1);
12705        SDValue Y;
12706        if (N0.getOperand(0) == Mask)
12707          Y = N0.getOperand(1);
12708        if (N0.getOperand(1) == Mask)
12709          Y = N0.getOperand(0);
12710
12711        // Check to see if the mask appeared in both the AND and ANDNP and
12712        if (!Y.getNode())
12713          return SDValue();
12714
12715        // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
12716        if (Mask.getOpcode() != ISD::BITCAST ||
12717            X.getOpcode() != ISD::BITCAST ||
12718            Y.getOpcode() != ISD::BITCAST)
12719          return SDValue();
12720
12721        // Look through mask bitcast.
12722        Mask = Mask.getOperand(0);
12723        EVT MaskVT = Mask.getValueType();
12724
12725        // Validate that the Mask operand is a vector sra node.  The sra node
12726        // will be an intrinsic.
12727        if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
12728          return SDValue();
12729
12730        // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
12731        // there is no psrai.b
12732        switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) {
12733        case Intrinsic::x86_sse2_psrai_w:
12734        case Intrinsic::x86_sse2_psrai_d:
12735          break;
12736        default: return SDValue();
12737        }
12738
12739        // Check that the SRA is all signbits.
12740        SDValue SraC = Mask.getOperand(2);
12741        unsigned SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
12742        unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
12743        if ((SraAmt + 1) != EltBits)
12744          return SDValue();
12745
12746        DebugLoc DL = N->getDebugLoc();
12747
12748        // Now we know we at least have a plendvb with the mask val.  See if
12749        // we can form a psignb/w/d.
12750        // psign = x.type == y.type == mask.type && y = sub(0, x);
12751        X = X.getOperand(0);
12752        Y = Y.getOperand(0);
12753        if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
12754            ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
12755            X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){
12756          unsigned Opc = 0;
12757          switch (EltBits) {
12758          case 8: Opc = X86ISD::PSIGNB; break;
12759          case 16: Opc = X86ISD::PSIGNW; break;
12760          case 32: Opc = X86ISD::PSIGND; break;
12761          default: break;
12762          }
12763          if (Opc) {
12764            SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1));
12765            return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign);
12766          }
12767        }
12768        // PBLENDVB only available on SSE 4.1
12769        if (!Subtarget->hasSSE41())
12770          return SDValue();
12771
12772        X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X);
12773        Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y);
12774        Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask);
12775        Mask = DAG.getNode(X86ISD::PBLENDVB, DL, MVT::v16i8, X, Y, Mask);
12776        return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask);
12777      }
12778    }
12779  }
12780
12781  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
12782  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
12783    std::swap(N0, N1);
12784  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
12785    return SDValue();
12786  if (!N0.hasOneUse() || !N1.hasOneUse())
12787    return SDValue();
12788
12789  SDValue ShAmt0 = N0.getOperand(1);
12790  if (ShAmt0.getValueType() != MVT::i8)
12791    return SDValue();
12792  SDValue ShAmt1 = N1.getOperand(1);
12793  if (ShAmt1.getValueType() != MVT::i8)
12794    return SDValue();
12795  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
12796    ShAmt0 = ShAmt0.getOperand(0);
12797  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
12798    ShAmt1 = ShAmt1.getOperand(0);
12799
12800  DebugLoc DL = N->getDebugLoc();
12801  unsigned Opc = X86ISD::SHLD;
12802  SDValue Op0 = N0.getOperand(0);
12803  SDValue Op1 = N1.getOperand(0);
12804  if (ShAmt0.getOpcode() == ISD::SUB) {
12805    Opc = X86ISD::SHRD;
12806    std::swap(Op0, Op1);
12807    std::swap(ShAmt0, ShAmt1);
12808  }
12809
12810  unsigned Bits = VT.getSizeInBits();
12811  if (ShAmt1.getOpcode() == ISD::SUB) {
12812    SDValue Sum = ShAmt1.getOperand(0);
12813    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
12814      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
12815      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
12816        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
12817      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
12818        return DAG.getNode(Opc, DL, VT,
12819                           Op0, Op1,
12820                           DAG.getNode(ISD::TRUNCATE, DL,
12821                                       MVT::i8, ShAmt0));
12822    }
12823  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
12824    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
12825    if (ShAmt0C &&
12826        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
12827      return DAG.getNode(Opc, DL, VT,
12828                         N0.getOperand(0), N1.getOperand(0),
12829                         DAG.getNode(ISD::TRUNCATE, DL,
12830                                       MVT::i8, ShAmt0));
12831  }
12832
12833  return SDValue();
12834}
12835
12836/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
12837static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
12838                                   const X86Subtarget *Subtarget) {
12839  StoreSDNode *St = cast<StoreSDNode>(N);
12840  EVT VT = St->getValue().getValueType();
12841  EVT StVT = St->getMemoryVT();
12842  DebugLoc dl = St->getDebugLoc();
12843  SDValue StoredVal = St->getOperand(1);
12844  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12845
12846  // If we are saving a concatination of two XMM registers, perform two stores.
12847  // This is better in Sandy Bridge cause one 256-bit mem op is done via two
12848  // 128-bit ones. If in the future the cost becomes only one memory access the
12849  // first version would be better.
12850  if (VT.getSizeInBits() == 256 &&
12851    StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
12852    StoredVal.getNumOperands() == 2) {
12853
12854    SDValue Value0 = StoredVal.getOperand(0);
12855    SDValue Value1 = StoredVal.getOperand(1);
12856
12857    SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
12858    SDValue Ptr0 = St->getBasePtr();
12859    SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
12860
12861    SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
12862                                St->getPointerInfo(), St->isVolatile(),
12863                                St->isNonTemporal(), St->getAlignment());
12864    SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
12865                                St->getPointerInfo(), St->isVolatile(),
12866                                St->isNonTemporal(), St->getAlignment());
12867    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
12868  }
12869
12870  // Optimize trunc store (of multiple scalars) to shuffle and store.
12871  // First, pack all of the elements in one place. Next, store to memory
12872  // in fewer chunks.
12873  if (St->isTruncatingStore() && VT.isVector()) {
12874    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12875    unsigned NumElems = VT.getVectorNumElements();
12876    assert(StVT != VT && "Cannot truncate to the same type");
12877    unsigned FromSz = VT.getVectorElementType().getSizeInBits();
12878    unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
12879
12880    // From, To sizes and ElemCount must be pow of two
12881    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
12882    // We are going to use the original vector elt for storing.
12883    // accumulated smaller vector elements must be a multiple of bigger size.
12884    if (0 != (NumElems * ToSz) % FromSz) return SDValue();
12885    unsigned SizeRatio  = FromSz / ToSz;
12886
12887    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
12888
12889    // Create a type on which we perform the shuffle
12890    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
12891            StVT.getScalarType(), NumElems*SizeRatio);
12892
12893    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
12894
12895    SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
12896    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
12897    for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio;
12898
12899    // Can't shuffle using an illegal type
12900    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
12901
12902    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
12903                                DAG.getUNDEF(WideVec.getValueType()),
12904                                ShuffleVec.data());
12905    // At this point all of the data is stored at the bottom of the
12906    // register. We now need to save it to mem.
12907
12908    // Find the largest store unit
12909    MVT StoreType = MVT::i8;
12910    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
12911         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
12912      MVT Tp = (MVT::SimpleValueType)tp;
12913      if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz)
12914        StoreType = Tp;
12915    }
12916
12917    // Bitcast the original vector into a vector of store-size units
12918    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
12919            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
12920    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
12921    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
12922    SmallVector<SDValue, 8> Chains;
12923    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
12924                                        TLI.getPointerTy());
12925    SDValue Ptr = St->getBasePtr();
12926
12927    // Perform one or more big stores into memory.
12928    for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) {
12929      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12930                                   StoreType, ShuffWide,
12931                                   DAG.getIntPtrConstant(i));
12932      SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
12933                                St->getPointerInfo(), St->isVolatile(),
12934                                St->isNonTemporal(), St->getAlignment());
12935      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
12936      Chains.push_back(Ch);
12937    }
12938
12939    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
12940                               Chains.size());
12941  }
12942
12943
12944  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
12945  // the FP state in cases where an emms may be missing.
12946  // A preferable solution to the general problem is to figure out the right
12947  // places to insert EMMS.  This qualifies as a quick hack.
12948
12949  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
12950  if (VT.getSizeInBits() != 64)
12951    return SDValue();
12952
12953  const Function *F = DAG.getMachineFunction().getFunction();
12954  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
12955  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
12956    && Subtarget->hasSSE2();
12957  if ((VT.isVector() ||
12958       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
12959      isa<LoadSDNode>(St->getValue()) &&
12960      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
12961      St->getChain().hasOneUse() && !St->isVolatile()) {
12962    SDNode* LdVal = St->getValue().getNode();
12963    LoadSDNode *Ld = 0;
12964    int TokenFactorIndex = -1;
12965    SmallVector<SDValue, 8> Ops;
12966    SDNode* ChainVal = St->getChain().getNode();
12967    // Must be a store of a load.  We currently handle two cases:  the load
12968    // is a direct child, and it's under an intervening TokenFactor.  It is
12969    // possible to dig deeper under nested TokenFactors.
12970    if (ChainVal == LdVal)
12971      Ld = cast<LoadSDNode>(St->getChain());
12972    else if (St->getValue().hasOneUse() &&
12973             ChainVal->getOpcode() == ISD::TokenFactor) {
12974      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
12975        if (ChainVal->getOperand(i).getNode() == LdVal) {
12976          TokenFactorIndex = i;
12977          Ld = cast<LoadSDNode>(St->getValue());
12978        } else
12979          Ops.push_back(ChainVal->getOperand(i));
12980      }
12981    }
12982
12983    if (!Ld || !ISD::isNormalLoad(Ld))
12984      return SDValue();
12985
12986    // If this is not the MMX case, i.e. we are just turning i64 load/store
12987    // into f64 load/store, avoid the transformation if there are multiple
12988    // uses of the loaded value.
12989    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
12990      return SDValue();
12991
12992    DebugLoc LdDL = Ld->getDebugLoc();
12993    DebugLoc StDL = N->getDebugLoc();
12994    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
12995    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
12996    // pair instead.
12997    if (Subtarget->is64Bit() || F64IsLegal) {
12998      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
12999      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
13000                                  Ld->getPointerInfo(), Ld->isVolatile(),
13001                                  Ld->isNonTemporal(), Ld->getAlignment());
13002      SDValue NewChain = NewLd.getValue(1);
13003      if (TokenFactorIndex != -1) {
13004        Ops.push_back(NewChain);
13005        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
13006                               Ops.size());
13007      }
13008      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
13009                          St->getPointerInfo(),
13010                          St->isVolatile(), St->isNonTemporal(),
13011                          St->getAlignment());
13012    }
13013
13014    // Otherwise, lower to two pairs of 32-bit loads / stores.
13015    SDValue LoAddr = Ld->getBasePtr();
13016    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
13017                                 DAG.getConstant(4, MVT::i32));
13018
13019    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
13020                               Ld->getPointerInfo(),
13021                               Ld->isVolatile(), Ld->isNonTemporal(),
13022                               Ld->getAlignment());
13023    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
13024                               Ld->getPointerInfo().getWithOffset(4),
13025                               Ld->isVolatile(), Ld->isNonTemporal(),
13026                               MinAlign(Ld->getAlignment(), 4));
13027
13028    SDValue NewChain = LoLd.getValue(1);
13029    if (TokenFactorIndex != -1) {
13030      Ops.push_back(LoLd);
13031      Ops.push_back(HiLd);
13032      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
13033                             Ops.size());
13034    }
13035
13036    LoAddr = St->getBasePtr();
13037    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
13038                         DAG.getConstant(4, MVT::i32));
13039
13040    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
13041                                St->getPointerInfo(),
13042                                St->isVolatile(), St->isNonTemporal(),
13043                                St->getAlignment());
13044    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
13045                                St->getPointerInfo().getWithOffset(4),
13046                                St->isVolatile(),
13047                                St->isNonTemporal(),
13048                                MinAlign(St->getAlignment(), 4));
13049    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
13050  }
13051  return SDValue();
13052}
13053
13054/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
13055/// X86ISD::FXOR nodes.
13056static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
13057  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
13058  // F[X]OR(0.0, x) -> x
13059  // F[X]OR(x, 0.0) -> x
13060  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
13061    if (C->getValueAPF().isPosZero())
13062      return N->getOperand(1);
13063  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
13064    if (C->getValueAPF().isPosZero())
13065      return N->getOperand(0);
13066  return SDValue();
13067}
13068
13069/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
13070static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
13071  // FAND(0.0, x) -> 0.0
13072  // FAND(x, 0.0) -> 0.0
13073  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
13074    if (C->getValueAPF().isPosZero())
13075      return N->getOperand(0);
13076  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
13077    if (C->getValueAPF().isPosZero())
13078      return N->getOperand(1);
13079  return SDValue();
13080}
13081
13082static SDValue PerformBTCombine(SDNode *N,
13083                                SelectionDAG &DAG,
13084                                TargetLowering::DAGCombinerInfo &DCI) {
13085  // BT ignores high bits in the bit index operand.
13086  SDValue Op1 = N->getOperand(1);
13087  if (Op1.hasOneUse()) {
13088    unsigned BitWidth = Op1.getValueSizeInBits();
13089    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
13090    APInt KnownZero, KnownOne;
13091    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
13092                                          !DCI.isBeforeLegalizeOps());
13093    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13094    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
13095        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
13096      DCI.CommitTargetLoweringOpt(TLO);
13097  }
13098  return SDValue();
13099}
13100
13101static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
13102  SDValue Op = N->getOperand(0);
13103  if (Op.getOpcode() == ISD::BITCAST)
13104    Op = Op.getOperand(0);
13105  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
13106  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
13107      VT.getVectorElementType().getSizeInBits() ==
13108      OpVT.getVectorElementType().getSizeInBits()) {
13109    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
13110  }
13111  return SDValue();
13112}
13113
13114static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
13115  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
13116  //           (and (i32 x86isd::setcc_carry), 1)
13117  // This eliminates the zext. This transformation is necessary because
13118  // ISD::SETCC is always legalized to i8.
13119  DebugLoc dl = N->getDebugLoc();
13120  SDValue N0 = N->getOperand(0);
13121  EVT VT = N->getValueType(0);
13122  if (N0.getOpcode() == ISD::AND &&
13123      N0.hasOneUse() &&
13124      N0.getOperand(0).hasOneUse()) {
13125    SDValue N00 = N0.getOperand(0);
13126    if (N00.getOpcode() != X86ISD::SETCC_CARRY)
13127      return SDValue();
13128    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
13129    if (!C || C->getZExtValue() != 1)
13130      return SDValue();
13131    return DAG.getNode(ISD::AND, dl, VT,
13132                       DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
13133                                   N00.getOperand(0), N00.getOperand(1)),
13134                       DAG.getConstant(1, VT));
13135  }
13136
13137  return SDValue();
13138}
13139
13140// Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
13141static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
13142  unsigned X86CC = N->getConstantOperandVal(0);
13143  SDValue EFLAG = N->getOperand(1);
13144  DebugLoc DL = N->getDebugLoc();
13145
13146  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
13147  // a zext and produces an all-ones bit which is more useful than 0/1 in some
13148  // cases.
13149  if (X86CC == X86::COND_B)
13150    return DAG.getNode(ISD::AND, DL, MVT::i8,
13151                       DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
13152                                   DAG.getConstant(X86CC, MVT::i8), EFLAG),
13153                       DAG.getConstant(1, MVT::i8));
13154
13155  return SDValue();
13156}
13157
13158static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
13159                                        const X86TargetLowering *XTLI) {
13160  SDValue Op0 = N->getOperand(0);
13161  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
13162  // a 32-bit target where SSE doesn't support i64->FP operations.
13163  if (Op0.getOpcode() == ISD::LOAD) {
13164    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
13165    EVT VT = Ld->getValueType(0);
13166    if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
13167        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
13168        !XTLI->getSubtarget()->is64Bit() &&
13169        !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
13170      SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
13171                                          Ld->getChain(), Op0, DAG);
13172      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
13173      return FILDChain;
13174    }
13175  }
13176  return SDValue();
13177}
13178
13179// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
13180static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
13181                                 X86TargetLowering::DAGCombinerInfo &DCI) {
13182  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
13183  // the result is either zero or one (depending on the input carry bit).
13184  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
13185  if (X86::isZeroNode(N->getOperand(0)) &&
13186      X86::isZeroNode(N->getOperand(1)) &&
13187      // We don't have a good way to replace an EFLAGS use, so only do this when
13188      // dead right now.
13189      SDValue(N, 1).use_empty()) {
13190    DebugLoc DL = N->getDebugLoc();
13191    EVT VT = N->getValueType(0);
13192    SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
13193    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
13194                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
13195                                           DAG.getConstant(X86::COND_B,MVT::i8),
13196                                           N->getOperand(2)),
13197                               DAG.getConstant(1, VT));
13198    return DCI.CombineTo(N, Res1, CarryOut);
13199  }
13200
13201  return SDValue();
13202}
13203
13204// fold (add Y, (sete  X, 0)) -> adc  0, Y
13205//      (add Y, (setne X, 0)) -> sbb -1, Y
13206//      (sub (sete  X, 0), Y) -> sbb  0, Y
13207//      (sub (setne X, 0), Y) -> adc -1, Y
13208static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
13209  DebugLoc DL = N->getDebugLoc();
13210
13211  // Look through ZExts.
13212  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
13213  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
13214    return SDValue();
13215
13216  SDValue SetCC = Ext.getOperand(0);
13217  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
13218    return SDValue();
13219
13220  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
13221  if (CC != X86::COND_E && CC != X86::COND_NE)
13222    return SDValue();
13223
13224  SDValue Cmp = SetCC.getOperand(1);
13225  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
13226      !X86::isZeroNode(Cmp.getOperand(1)) ||
13227      !Cmp.getOperand(0).getValueType().isInteger())
13228    return SDValue();
13229
13230  SDValue CmpOp0 = Cmp.getOperand(0);
13231  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
13232                               DAG.getConstant(1, CmpOp0.getValueType()));
13233
13234  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
13235  if (CC == X86::COND_NE)
13236    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
13237                       DL, OtherVal.getValueType(), OtherVal,
13238                       DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
13239  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
13240                     DL, OtherVal.getValueType(), OtherVal,
13241                     DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
13242}
13243
13244static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) {
13245  SDValue Op0 = N->getOperand(0);
13246  SDValue Op1 = N->getOperand(1);
13247
13248  // X86 can't encode an immediate LHS of a sub. See if we can push the
13249  // negation into a preceding instruction.
13250  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
13251    uint64_t Op0C = C->getSExtValue();
13252
13253    // If the RHS of the sub is a XOR with one use and a constant, invert the
13254    // immediate. Then add one to the LHS of the sub so we can turn
13255    // X-Y -> X+~Y+1, saving one register.
13256    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
13257        isa<ConstantSDNode>(Op1.getOperand(1))) {
13258      uint64_t XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getSExtValue();
13259      EVT VT = Op0.getValueType();
13260      SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
13261                                   Op1.getOperand(0),
13262                                   DAG.getConstant(~XorC, VT));
13263      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
13264                         DAG.getConstant(Op0C+1, VT));
13265    }
13266  }
13267
13268  return OptimizeConditionalInDecrement(N, DAG);
13269}
13270
13271SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
13272                                             DAGCombinerInfo &DCI) const {
13273  SelectionDAG &DAG = DCI.DAG;
13274  switch (N->getOpcode()) {
13275  default: break;
13276  case ISD::EXTRACT_VECTOR_ELT:
13277    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
13278  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
13279  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
13280  case ISD::ADD:            return OptimizeConditionalInDecrement(N, DAG);
13281  case ISD::SUB:            return PerformSubCombine(N, DAG);
13282  case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
13283  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
13284  case ISD::SHL:
13285  case ISD::SRA:
13286  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
13287  case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
13288  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
13289  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
13290  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
13291  case X86ISD::FXOR:
13292  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
13293  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
13294  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
13295  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
13296  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG);
13297  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG);
13298  case X86ISD::SHUFPS:      // Handle all target specific shuffles
13299  case X86ISD::SHUFPD:
13300  case X86ISD::PALIGN:
13301  case X86ISD::PUNPCKHBW:
13302  case X86ISD::PUNPCKHWD:
13303  case X86ISD::PUNPCKHDQ:
13304  case X86ISD::PUNPCKHQDQ:
13305  case X86ISD::UNPCKHPS:
13306  case X86ISD::UNPCKHPD:
13307  case X86ISD::VUNPCKHPSY:
13308  case X86ISD::VUNPCKHPDY:
13309  case X86ISD::PUNPCKLBW:
13310  case X86ISD::PUNPCKLWD:
13311  case X86ISD::PUNPCKLDQ:
13312  case X86ISD::PUNPCKLQDQ:
13313  case X86ISD::UNPCKLPS:
13314  case X86ISD::UNPCKLPD:
13315  case X86ISD::VUNPCKLPSY:
13316  case X86ISD::VUNPCKLPDY:
13317  case X86ISD::MOVHLPS:
13318  case X86ISD::MOVLHPS:
13319  case X86ISD::PSHUFD:
13320  case X86ISD::PSHUFHW:
13321  case X86ISD::PSHUFLW:
13322  case X86ISD::MOVSS:
13323  case X86ISD::MOVSD:
13324  case X86ISD::VPERMILPS:
13325  case X86ISD::VPERMILPSY:
13326  case X86ISD::VPERMILPD:
13327  case X86ISD::VPERMILPDY:
13328  case X86ISD::VPERM2F128:
13329  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
13330  }
13331
13332  return SDValue();
13333}
13334
13335/// isTypeDesirableForOp - Return true if the target has native support for
13336/// the specified value type and it is 'desirable' to use the type for the
13337/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
13338/// instruction encodings are longer and some i16 instructions are slow.
13339bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
13340  if (!isTypeLegal(VT))
13341    return false;
13342  if (VT != MVT::i16)
13343    return true;
13344
13345  switch (Opc) {
13346  default:
13347    return true;
13348  case ISD::LOAD:
13349  case ISD::SIGN_EXTEND:
13350  case ISD::ZERO_EXTEND:
13351  case ISD::ANY_EXTEND:
13352  case ISD::SHL:
13353  case ISD::SRL:
13354  case ISD::SUB:
13355  case ISD::ADD:
13356  case ISD::MUL:
13357  case ISD::AND:
13358  case ISD::OR:
13359  case ISD::XOR:
13360    return false;
13361  }
13362}
13363
13364/// IsDesirableToPromoteOp - This method query the target whether it is
13365/// beneficial for dag combiner to promote the specified node. If true, it
13366/// should return the desired promotion type by reference.
13367bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
13368  EVT VT = Op.getValueType();
13369  if (VT != MVT::i16)
13370    return false;
13371
13372  bool Promote = false;
13373  bool Commute = false;
13374  switch (Op.getOpcode()) {
13375  default: break;
13376  case ISD::LOAD: {
13377    LoadSDNode *LD = cast<LoadSDNode>(Op);
13378    // If the non-extending load has a single use and it's not live out, then it
13379    // might be folded.
13380    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
13381                                                     Op.hasOneUse()*/) {
13382      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
13383             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
13384        // The only case where we'd want to promote LOAD (rather then it being
13385        // promoted as an operand is when it's only use is liveout.
13386        if (UI->getOpcode() != ISD::CopyToReg)
13387          return false;
13388      }
13389    }
13390    Promote = true;
13391    break;
13392  }
13393  case ISD::SIGN_EXTEND:
13394  case ISD::ZERO_EXTEND:
13395  case ISD::ANY_EXTEND:
13396    Promote = true;
13397    break;
13398  case ISD::SHL:
13399  case ISD::SRL: {
13400    SDValue N0 = Op.getOperand(0);
13401    // Look out for (store (shl (load), x)).
13402    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
13403      return false;
13404    Promote = true;
13405    break;
13406  }
13407  case ISD::ADD:
13408  case ISD::MUL:
13409  case ISD::AND:
13410  case ISD::OR:
13411  case ISD::XOR:
13412    Commute = true;
13413    // fallthrough
13414  case ISD::SUB: {
13415    SDValue N0 = Op.getOperand(0);
13416    SDValue N1 = Op.getOperand(1);
13417    if (!Commute && MayFoldLoad(N1))
13418      return false;
13419    // Avoid disabling potential load folding opportunities.
13420    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
13421      return false;
13422    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
13423      return false;
13424    Promote = true;
13425  }
13426  }
13427
13428  PVT = MVT::i32;
13429  return Promote;
13430}
13431
13432//===----------------------------------------------------------------------===//
13433//                           X86 Inline Assembly Support
13434//===----------------------------------------------------------------------===//
13435
13436bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
13437  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
13438
13439  std::string AsmStr = IA->getAsmString();
13440
13441  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
13442  SmallVector<StringRef, 4> AsmPieces;
13443  SplitString(AsmStr, AsmPieces, ";\n");
13444
13445  switch (AsmPieces.size()) {
13446  default: return false;
13447  case 1:
13448    AsmStr = AsmPieces[0];
13449    AsmPieces.clear();
13450    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.
13451
13452    // FIXME: this should verify that we are targeting a 486 or better.  If not,
13453    // we will turn this bswap into something that will be lowered to logical ops
13454    // instead of emitting the bswap asm.  For now, we don't support 486 or lower
13455    // so don't worry about this.
13456    // bswap $0
13457    if (AsmPieces.size() == 2 &&
13458        (AsmPieces[0] == "bswap" ||
13459         AsmPieces[0] == "bswapq" ||
13460         AsmPieces[0] == "bswapl") &&
13461        (AsmPieces[1] == "$0" ||
13462         AsmPieces[1] == "${0:q}")) {
13463      // No need to check constraints, nothing other than the equivalent of
13464      // "=r,0" would be valid here.
13465      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
13466      if (!Ty || Ty->getBitWidth() % 16 != 0)
13467        return false;
13468      return IntrinsicLowering::LowerToByteSwap(CI);
13469    }
13470    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
13471    if (CI->getType()->isIntegerTy(16) &&
13472        AsmPieces.size() == 3 &&
13473        (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") &&
13474        AsmPieces[1] == "$$8," &&
13475        AsmPieces[2] == "${0:w}" &&
13476        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
13477      AsmPieces.clear();
13478      const std::string &ConstraintsStr = IA->getConstraintString();
13479      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
13480      std::sort(AsmPieces.begin(), AsmPieces.end());
13481      if (AsmPieces.size() == 4 &&
13482          AsmPieces[0] == "~{cc}" &&
13483          AsmPieces[1] == "~{dirflag}" &&
13484          AsmPieces[2] == "~{flags}" &&
13485          AsmPieces[3] == "~{fpsr}") {
13486        IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
13487        if (!Ty || Ty->getBitWidth() % 16 != 0)
13488          return false;
13489        return IntrinsicLowering::LowerToByteSwap(CI);
13490      }
13491    }
13492    break;
13493  case 3:
13494    if (CI->getType()->isIntegerTy(32) &&
13495        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
13496      SmallVector<StringRef, 4> Words;
13497      SplitString(AsmPieces[0], Words, " \t,");
13498      if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" &&
13499          Words[2] == "${0:w}") {
13500        Words.clear();
13501        SplitString(AsmPieces[1], Words, " \t,");
13502        if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" &&
13503            Words[2] == "$0") {
13504          Words.clear();
13505          SplitString(AsmPieces[2], Words, " \t,");
13506          if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" &&
13507              Words[2] == "${0:w}") {
13508            AsmPieces.clear();
13509            const std::string &ConstraintsStr = IA->getConstraintString();
13510            SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
13511            std::sort(AsmPieces.begin(), AsmPieces.end());
13512            if (AsmPieces.size() == 4 &&
13513                AsmPieces[0] == "~{cc}" &&
13514                AsmPieces[1] == "~{dirflag}" &&
13515                AsmPieces[2] == "~{flags}" &&
13516                AsmPieces[3] == "~{fpsr}") {
13517              IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
13518              if (!Ty || Ty->getBitWidth() % 16 != 0)
13519                return false;
13520              return IntrinsicLowering::LowerToByteSwap(CI);
13521            }
13522          }
13523        }
13524      }
13525    }
13526
13527    if (CI->getType()->isIntegerTy(64)) {
13528      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
13529      if (Constraints.size() >= 2 &&
13530          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
13531          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
13532        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
13533        SmallVector<StringRef, 4> Words;
13534        SplitString(AsmPieces[0], Words, " \t");
13535        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
13536          Words.clear();
13537          SplitString(AsmPieces[1], Words, " \t");
13538          if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
13539            Words.clear();
13540            SplitString(AsmPieces[2], Words, " \t,");
13541            if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
13542                Words[2] == "%edx") {
13543              IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
13544              if (!Ty || Ty->getBitWidth() % 16 != 0)
13545                return false;
13546              return IntrinsicLowering::LowerToByteSwap(CI);
13547            }
13548          }
13549        }
13550      }
13551    }
13552    break;
13553  }
13554  return false;
13555}
13556
13557
13558
13559/// getConstraintType - Given a constraint letter, return the type of
13560/// constraint it is for this target.
13561X86TargetLowering::ConstraintType
13562X86TargetLowering::getConstraintType(const std::string &Constraint) const {
13563  if (Constraint.size() == 1) {
13564    switch (Constraint[0]) {
13565    case 'R':
13566    case 'q':
13567    case 'Q':
13568    case 'f':
13569    case 't':
13570    case 'u':
13571    case 'y':
13572    case 'x':
13573    case 'Y':
13574    case 'l':
13575      return C_RegisterClass;
13576    case 'a':
13577    case 'b':
13578    case 'c':
13579    case 'd':
13580    case 'S':
13581    case 'D':
13582    case 'A':
13583      return C_Register;
13584    case 'I':
13585    case 'J':
13586    case 'K':
13587    case 'L':
13588    case 'M':
13589    case 'N':
13590    case 'G':
13591    case 'C':
13592    case 'e':
13593    case 'Z':
13594      return C_Other;
13595    default:
13596      break;
13597    }
13598  }
13599  return TargetLowering::getConstraintType(Constraint);
13600}
13601
13602/// Examine constraint type and operand type and determine a weight value.
13603/// This object must already have been set up with the operand type
13604/// and the current alternative constraint selected.
13605TargetLowering::ConstraintWeight
13606  X86TargetLowering::getSingleConstraintMatchWeight(
13607    AsmOperandInfo &info, const char *constraint) const {
13608  ConstraintWeight weight = CW_Invalid;
13609  Value *CallOperandVal = info.CallOperandVal;
13610    // If we don't have a value, we can't do a match,
13611    // but allow it at the lowest weight.
13612  if (CallOperandVal == NULL)
13613    return CW_Default;
13614  Type *type = CallOperandVal->getType();
13615  // Look at the constraint type.
13616  switch (*constraint) {
13617  default:
13618    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
13619  case 'R':
13620  case 'q':
13621  case 'Q':
13622  case 'a':
13623  case 'b':
13624  case 'c':
13625  case 'd':
13626  case 'S':
13627  case 'D':
13628  case 'A':
13629    if (CallOperandVal->getType()->isIntegerTy())
13630      weight = CW_SpecificReg;
13631    break;
13632  case 'f':
13633  case 't':
13634  case 'u':
13635      if (type->isFloatingPointTy())
13636        weight = CW_SpecificReg;
13637      break;
13638  case 'y':
13639      if (type->isX86_MMXTy() && Subtarget->hasMMX())
13640        weight = CW_SpecificReg;
13641      break;
13642  case 'x':
13643  case 'Y':
13644    if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM())
13645      weight = CW_Register;
13646    break;
13647  case 'I':
13648    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
13649      if (C->getZExtValue() <= 31)
13650        weight = CW_Constant;
13651    }
13652    break;
13653  case 'J':
13654    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13655      if (C->getZExtValue() <= 63)
13656        weight = CW_Constant;
13657    }
13658    break;
13659  case 'K':
13660    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13661      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
13662        weight = CW_Constant;
13663    }
13664    break;
13665  case 'L':
13666    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13667      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
13668        weight = CW_Constant;
13669    }
13670    break;
13671  case 'M':
13672    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13673      if (C->getZExtValue() <= 3)
13674        weight = CW_Constant;
13675    }
13676    break;
13677  case 'N':
13678    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13679      if (C->getZExtValue() <= 0xff)
13680        weight = CW_Constant;
13681    }
13682    break;
13683  case 'G':
13684  case 'C':
13685    if (dyn_cast<ConstantFP>(CallOperandVal)) {
13686      weight = CW_Constant;
13687    }
13688    break;
13689  case 'e':
13690    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13691      if ((C->getSExtValue() >= -0x80000000LL) &&
13692          (C->getSExtValue() <= 0x7fffffffLL))
13693        weight = CW_Constant;
13694    }
13695    break;
13696  case 'Z':
13697    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13698      if (C->getZExtValue() <= 0xffffffff)
13699        weight = CW_Constant;
13700    }
13701    break;
13702  }
13703  return weight;
13704}
13705
13706/// LowerXConstraint - try to replace an X constraint, which matches anything,
13707/// with another that has more specific requirements based on the type of the
13708/// corresponding operand.
13709const char *X86TargetLowering::
13710LowerXConstraint(EVT ConstraintVT) const {
13711  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
13712  // 'f' like normal targets.
13713  if (ConstraintVT.isFloatingPoint()) {
13714    if (Subtarget->hasXMMInt())
13715      return "Y";
13716    if (Subtarget->hasXMM())
13717      return "x";
13718  }
13719
13720  return TargetLowering::LowerXConstraint(ConstraintVT);
13721}
13722
13723/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
13724/// vector.  If it is invalid, don't add anything to Ops.
13725void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
13726                                                     std::string &Constraint,
13727                                                     std::vector<SDValue>&Ops,
13728                                                     SelectionDAG &DAG) const {
13729  SDValue Result(0, 0);
13730
13731  // Only support length 1 constraints for now.
13732  if (Constraint.length() > 1) return;
13733
13734  char ConstraintLetter = Constraint[0];
13735  switch (ConstraintLetter) {
13736  default: break;
13737  case 'I':
13738    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
13739      if (C->getZExtValue() <= 31) {
13740        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
13741        break;
13742      }
13743    }
13744    return;
13745  case 'J':
13746    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
13747      if (C->getZExtValue() <= 63) {
13748        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
13749        break;
13750      }
13751    }
13752    return;
13753  case 'K':
13754    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
13755      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
13756        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
13757        break;
13758      }
13759    }
13760    return;
13761  case 'N':
13762    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
13763      if (C->getZExtValue() <= 255) {
13764        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
13765        break;
13766      }
13767    }
13768    return;
13769  case 'e': {
13770    // 32-bit signed value
13771    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
13772      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
13773                                           C->getSExtValue())) {
13774        // Widen to 64 bits here to get it sign extended.
13775        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
13776        break;
13777      }
13778    // FIXME gcc accepts some relocatable values here too, but only in certain
13779    // memory models; it's complicated.
13780    }
13781    return;
13782  }
13783  case 'Z': {
13784    // 32-bit unsigned value
13785    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
13786      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
13787                                           C->getZExtValue())) {
13788        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
13789        break;
13790      }
13791    }
13792    // FIXME gcc accepts some relocatable values here too, but only in certain
13793    // memory models; it's complicated.
13794    return;
13795  }
13796  case 'i': {
13797    // Literal immediates are always ok.
13798    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
13799      // Widen to 64 bits here to get it sign extended.
13800      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
13801      break;
13802    }
13803
13804    // In any sort of PIC mode addresses need to be computed at runtime by
13805    // adding in a register or some sort of table lookup.  These can't
13806    // be used as immediates.
13807    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
13808      return;
13809
13810    // If we are in non-pic codegen mode, we allow the address of a global (with
13811    // an optional displacement) to be used with 'i'.
13812    GlobalAddressSDNode *GA = 0;
13813    int64_t Offset = 0;
13814
13815    // Match either (GA), (GA+C), (GA+C1+C2), etc.
13816    while (1) {
13817      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
13818        Offset += GA->getOffset();
13819        break;
13820      } else if (Op.getOpcode() == ISD::ADD) {
13821        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
13822          Offset += C->getZExtValue();
13823          Op = Op.getOperand(0);
13824          continue;
13825        }
13826      } else if (Op.getOpcode() == ISD::SUB) {
13827        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
13828          Offset += -C->getZExtValue();
13829          Op = Op.getOperand(0);
13830          continue;
13831        }
13832      }
13833
13834      // Otherwise, this isn't something we can handle, reject it.
13835      return;
13836    }
13837
13838    const GlobalValue *GV = GA->getGlobal();
13839    // If we require an extra load to get this address, as in PIC mode, we
13840    // can't accept it.
13841    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
13842                                                        getTargetMachine())))
13843      return;
13844
13845    Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
13846                                        GA->getValueType(0), Offset);
13847    break;
13848  }
13849  }
13850
13851  if (Result.getNode()) {
13852    Ops.push_back(Result);
13853    return;
13854  }
13855  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
13856}
13857
13858std::pair<unsigned, const TargetRegisterClass*>
13859X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
13860                                                EVT VT) const {
13861  // First, see if this is a constraint that directly corresponds to an LLVM
13862  // register class.
13863  if (Constraint.size() == 1) {
13864    // GCC Constraint Letters
13865    switch (Constraint[0]) {
13866    default: break;
13867      // TODO: Slight differences here in allocation order and leaving
13868      // RIP in the class. Do they matter any more here than they do
13869      // in the normal allocation?
13870    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
13871      if (Subtarget->is64Bit()) {
13872	if (VT == MVT::i32 || VT == MVT::f32)
13873	  return std::make_pair(0U, X86::GR32RegisterClass);
13874	else if (VT == MVT::i16)
13875	  return std::make_pair(0U, X86::GR16RegisterClass);
13876	else if (VT == MVT::i8 || VT == MVT::i1)
13877	  return std::make_pair(0U, X86::GR8RegisterClass);
13878	else if (VT == MVT::i64 || VT == MVT::f64)
13879	  return std::make_pair(0U, X86::GR64RegisterClass);
13880	break;
13881      }
13882      // 32-bit fallthrough
13883    case 'Q':   // Q_REGS
13884      if (VT == MVT::i32 || VT == MVT::f32)
13885	return std::make_pair(0U, X86::GR32_ABCDRegisterClass);
13886      else if (VT == MVT::i16)
13887	return std::make_pair(0U, X86::GR16_ABCDRegisterClass);
13888      else if (VT == MVT::i8 || VT == MVT::i1)
13889	return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass);
13890      else if (VT == MVT::i64)
13891	return std::make_pair(0U, X86::GR64_ABCDRegisterClass);
13892      break;
13893    case 'r':   // GENERAL_REGS
13894    case 'l':   // INDEX_REGS
13895      if (VT == MVT::i8 || VT == MVT::i1)
13896        return std::make_pair(0U, X86::GR8RegisterClass);
13897      if (VT == MVT::i16)
13898        return std::make_pair(0U, X86::GR16RegisterClass);
13899      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
13900        return std::make_pair(0U, X86::GR32RegisterClass);
13901      return std::make_pair(0U, X86::GR64RegisterClass);
13902    case 'R':   // LEGACY_REGS
13903      if (VT == MVT::i8 || VT == MVT::i1)
13904        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
13905      if (VT == MVT::i16)
13906        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
13907      if (VT == MVT::i32 || !Subtarget->is64Bit())
13908        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
13909      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
13910    case 'f':  // FP Stack registers.
13911      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
13912      // value to the correct fpstack register class.
13913      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
13914        return std::make_pair(0U, X86::RFP32RegisterClass);
13915      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
13916        return std::make_pair(0U, X86::RFP64RegisterClass);
13917      return std::make_pair(0U, X86::RFP80RegisterClass);
13918    case 'y':   // MMX_REGS if MMX allowed.
13919      if (!Subtarget->hasMMX()) break;
13920      return std::make_pair(0U, X86::VR64RegisterClass);
13921    case 'Y':   // SSE_REGS if SSE2 allowed
13922      if (!Subtarget->hasXMMInt()) break;
13923      // FALL THROUGH.
13924    case 'x':   // SSE_REGS if SSE1 allowed
13925      if (!Subtarget->hasXMM()) break;
13926
13927      switch (VT.getSimpleVT().SimpleTy) {
13928      default: break;
13929      // Scalar SSE types.
13930      case MVT::f32:
13931      case MVT::i32:
13932        return std::make_pair(0U, X86::FR32RegisterClass);
13933      case MVT::f64:
13934      case MVT::i64:
13935        return std::make_pair(0U, X86::FR64RegisterClass);
13936      // Vector types.
13937      case MVT::v16i8:
13938      case MVT::v8i16:
13939      case MVT::v4i32:
13940      case MVT::v2i64:
13941      case MVT::v4f32:
13942      case MVT::v2f64:
13943        return std::make_pair(0U, X86::VR128RegisterClass);
13944      }
13945      break;
13946    }
13947  }
13948
13949  // Use the default implementation in TargetLowering to convert the register
13950  // constraint into a member of a register class.
13951  std::pair<unsigned, const TargetRegisterClass*> Res;
13952  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
13953
13954  // Not found as a standard register?
13955  if (Res.second == 0) {
13956    // Map st(0) -> st(7) -> ST0
13957    if (Constraint.size() == 7 && Constraint[0] == '{' &&
13958        tolower(Constraint[1]) == 's' &&
13959        tolower(Constraint[2]) == 't' &&
13960        Constraint[3] == '(' &&
13961        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
13962        Constraint[5] == ')' &&
13963        Constraint[6] == '}') {
13964
13965      Res.first = X86::ST0+Constraint[4]-'0';
13966      Res.second = X86::RFP80RegisterClass;
13967      return Res;
13968    }
13969
13970    // GCC allows "st(0)" to be called just plain "st".
13971    if (StringRef("{st}").equals_lower(Constraint)) {
13972      Res.first = X86::ST0;
13973      Res.second = X86::RFP80RegisterClass;
13974      return Res;
13975    }
13976
13977    // flags -> EFLAGS
13978    if (StringRef("{flags}").equals_lower(Constraint)) {
13979      Res.first = X86::EFLAGS;
13980      Res.second = X86::CCRRegisterClass;
13981      return Res;
13982    }
13983
13984    // 'A' means EAX + EDX.
13985    if (Constraint == "A") {
13986      Res.first = X86::EAX;
13987      Res.second = X86::GR32_ADRegisterClass;
13988      return Res;
13989    }
13990    return Res;
13991  }
13992
13993  // Otherwise, check to see if this is a register class of the wrong value
13994  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
13995  // turn into {ax},{dx}.
13996  if (Res.second->hasType(VT))
13997    return Res;   // Correct type already, nothing to do.
13998
13999  // All of the single-register GCC register classes map their values onto
14000  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
14001  // really want an 8-bit or 32-bit register, map to the appropriate register
14002  // class and return the appropriate register.
14003  if (Res.second == X86::GR16RegisterClass) {
14004    if (VT == MVT::i8) {
14005      unsigned DestReg = 0;
14006      switch (Res.first) {
14007      default: break;
14008      case X86::AX: DestReg = X86::AL; break;
14009      case X86::DX: DestReg = X86::DL; break;
14010      case X86::CX: DestReg = X86::CL; break;
14011      case X86::BX: DestReg = X86::BL; break;
14012      }
14013      if (DestReg) {
14014        Res.first = DestReg;
14015        Res.second = X86::GR8RegisterClass;
14016      }
14017    } else if (VT == MVT::i32) {
14018      unsigned DestReg = 0;
14019      switch (Res.first) {
14020      default: break;
14021      case X86::AX: DestReg = X86::EAX; break;
14022      case X86::DX: DestReg = X86::EDX; break;
14023      case X86::CX: DestReg = X86::ECX; break;
14024      case X86::BX: DestReg = X86::EBX; break;
14025      case X86::SI: DestReg = X86::ESI; break;
14026      case X86::DI: DestReg = X86::EDI; break;
14027      case X86::BP: DestReg = X86::EBP; break;
14028      case X86::SP: DestReg = X86::ESP; break;
14029      }
14030      if (DestReg) {
14031        Res.first = DestReg;
14032        Res.second = X86::GR32RegisterClass;
14033      }
14034    } else if (VT == MVT::i64) {
14035      unsigned DestReg = 0;
14036      switch (Res.first) {
14037      default: break;
14038      case X86::AX: DestReg = X86::RAX; break;
14039      case X86::DX: DestReg = X86::RDX; break;
14040      case X86::CX: DestReg = X86::RCX; break;
14041      case X86::BX: DestReg = X86::RBX; break;
14042      case X86::SI: DestReg = X86::RSI; break;
14043      case X86::DI: DestReg = X86::RDI; break;
14044      case X86::BP: DestReg = X86::RBP; break;
14045      case X86::SP: DestReg = X86::RSP; break;
14046      }
14047      if (DestReg) {
14048        Res.first = DestReg;
14049        Res.second = X86::GR64RegisterClass;
14050      }
14051    }
14052  } else if (Res.second == X86::FR32RegisterClass ||
14053             Res.second == X86::FR64RegisterClass ||
14054             Res.second == X86::VR128RegisterClass) {
14055    // Handle references to XMM physical registers that got mapped into the
14056    // wrong class.  This can happen with constraints like {xmm0} where the
14057    // target independent register mapper will just pick the first match it can
14058    // find, ignoring the required type.
14059    if (VT == MVT::f32)
14060      Res.second = X86::FR32RegisterClass;
14061    else if (VT == MVT::f64)
14062      Res.second = X86::FR64RegisterClass;
14063    else if (X86::VR128RegisterClass->hasType(VT))
14064      Res.second = X86::VR128RegisterClass;
14065  }
14066
14067  return Res;
14068}
14069