X86ISelLowering.cpp revision a1fb1d2ed7342c7e6b491a78af073b5320bc9867
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "x86-isel"
16#include "X86ISelLowering.h"
17#include "X86.h"
18#include "X86InstrBuilder.h"
19#include "X86TargetMachine.h"
20#include "X86TargetObjectFile.h"
21#include "Utils/X86ShuffleDecode.h"
22#include "llvm/CallingConv.h"
23#include "llvm/Constants.h"
24#include "llvm/DerivedTypes.h"
25#include "llvm/GlobalAlias.h"
26#include "llvm/GlobalVariable.h"
27#include "llvm/Function.h"
28#include "llvm/Instructions.h"
29#include "llvm/Intrinsics.h"
30#include "llvm/LLVMContext.h"
31#include "llvm/CodeGen/IntrinsicLowering.h"
32#include "llvm/CodeGen/MachineFrameInfo.h"
33#include "llvm/CodeGen/MachineFunction.h"
34#include "llvm/CodeGen/MachineInstrBuilder.h"
35#include "llvm/CodeGen/MachineJumpTableInfo.h"
36#include "llvm/CodeGen/MachineModuleInfo.h"
37#include "llvm/CodeGen/MachineRegisterInfo.h"
38#include "llvm/MC/MCAsmInfo.h"
39#include "llvm/MC/MCContext.h"
40#include "llvm/MC/MCExpr.h"
41#include "llvm/MC/MCSymbol.h"
42#include "llvm/ADT/SmallSet.h"
43#include "llvm/ADT/Statistic.h"
44#include "llvm/ADT/StringExtras.h"
45#include "llvm/ADT/VariadicFunction.h"
46#include "llvm/Support/CallSite.h"
47#include "llvm/Support/Debug.h"
48#include "llvm/Support/ErrorHandling.h"
49#include "llvm/Support/MathExtras.h"
50#include "llvm/Target/TargetOptions.h"
51#include <bitset>
52#include <cctype>
53using namespace llvm;
54
55STATISTIC(NumTailCalls, "Number of tail calls");
56
57// Forward declarations.
58static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
59                       SDValue V2);
60
61/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
62/// sets things up to match to an AVX VEXTRACTF128 instruction or a
63/// simple subregister reference.  Idx is an index in the 128 bits we
64/// want.  It need not be aligned to a 128-bit bounday.  That makes
65/// lowering EXTRACT_VECTOR_ELT operations easier.
66static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
67                                   SelectionDAG &DAG, DebugLoc dl) {
68  EVT VT = Vec.getValueType();
69  assert(VT.is256BitVector() && "Unexpected vector size!");
70  EVT ElVT = VT.getVectorElementType();
71  unsigned Factor = VT.getSizeInBits()/128;
72  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
73                                  VT.getVectorNumElements()/Factor);
74
75  // Extract from UNDEF is UNDEF.
76  if (Vec.getOpcode() == ISD::UNDEF)
77    return DAG.getUNDEF(ResultVT);
78
79  // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
80  // we can match to VEXTRACTF128.
81  unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
82
83  // This is the index of the first element of the 128-bit chunk
84  // we want.
85  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
86                               * ElemsPerChunk);
87
88  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
89  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
90                               VecIdx);
91
92  return Result;
93}
94
95/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
96/// sets things up to match to an AVX VINSERTF128 instruction or a
97/// simple superregister reference.  Idx is an index in the 128 bits
98/// we want.  It need not be aligned to a 128-bit bounday.  That makes
99/// lowering INSERT_VECTOR_ELT operations easier.
100static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
101                                  unsigned IdxVal, SelectionDAG &DAG,
102                                  DebugLoc dl) {
103  // Inserting UNDEF is Result
104  if (Vec.getOpcode() == ISD::UNDEF)
105    return Result;
106
107  EVT VT = Vec.getValueType();
108  assert(VT.is128BitVector() && "Unexpected vector size!");
109
110  EVT ElVT = VT.getVectorElementType();
111  EVT ResultVT = Result.getValueType();
112
113  // Insert the relevant 128 bits.
114  unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
115
116  // This is the index of the first element of the 128-bit chunk
117  // we want.
118  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
119                               * ElemsPerChunk);
120
121  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
122  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
123                     VecIdx);
124}
125
126/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
127/// instructions. This is used because creating CONCAT_VECTOR nodes of
128/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
129/// large BUILD_VECTORS.
130static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
131                                   unsigned NumElems, SelectionDAG &DAG,
132                                   DebugLoc dl) {
133  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
134  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
135}
136
137static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
138  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
139  bool is64Bit = Subtarget->is64Bit();
140
141  if (Subtarget->isTargetEnvMacho()) {
142    if (is64Bit)
143      return new X86_64MachoTargetObjectFile();
144    return new TargetLoweringObjectFileMachO();
145  }
146
147  if (Subtarget->isTargetLinux())
148    return new X86LinuxTargetObjectFile();
149  if (Subtarget->isTargetELF())
150    return new TargetLoweringObjectFileELF();
151  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
152    return new TargetLoweringObjectFileCOFF();
153  llvm_unreachable("unknown subtarget type");
154}
155
156X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
157  : TargetLowering(TM, createTLOF(TM)) {
158  Subtarget = &TM.getSubtarget<X86Subtarget>();
159  X86ScalarSSEf64 = Subtarget->hasSSE2();
160  X86ScalarSSEf32 = Subtarget->hasSSE1();
161  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
162
163  RegInfo = TM.getRegisterInfo();
164  TD = getTargetData();
165
166  // Set up the TargetLowering object.
167  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
168
169  // X86 is weird, it always uses i8 for shift amounts and setcc results.
170  setBooleanContents(ZeroOrOneBooleanContent);
171  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
172  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
173
174  // For 64-bit since we have so many registers use the ILP scheduler, for
175  // 32-bit code use the register pressure specific scheduling.
176  // For Atom, always use ILP scheduling.
177  if (Subtarget->isAtom())
178    setSchedulingPreference(Sched::ILP);
179  else if (Subtarget->is64Bit())
180    setSchedulingPreference(Sched::ILP);
181  else
182    setSchedulingPreference(Sched::RegPressure);
183  setStackPointerRegisterToSaveRestore(X86StackPtr);
184
185  // Bypass i32 with i8 on Atom when compiling with O2
186  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default)
187    addBypassSlowDivType(Type::getInt32Ty(getGlobalContext()), Type::getInt8Ty(getGlobalContext()));
188
189  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
190    // Setup Windows compiler runtime calls.
191    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
192    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
193    setLibcallName(RTLIB::SREM_I64, "_allrem");
194    setLibcallName(RTLIB::UREM_I64, "_aullrem");
195    setLibcallName(RTLIB::MUL_I64, "_allmul");
196    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
197    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
198    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
199    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
200    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
201
202    // The _ftol2 runtime function has an unusual calling conv, which
203    // is modeled by a special pseudo-instruction.
204    setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
205    setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
206    setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
207    setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
208  }
209
210  if (Subtarget->isTargetDarwin()) {
211    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
212    setUseUnderscoreSetJmp(false);
213    setUseUnderscoreLongJmp(false);
214  } else if (Subtarget->isTargetMingw()) {
215    // MS runtime is weird: it exports _setjmp, but longjmp!
216    setUseUnderscoreSetJmp(true);
217    setUseUnderscoreLongJmp(false);
218  } else {
219    setUseUnderscoreSetJmp(true);
220    setUseUnderscoreLongJmp(true);
221  }
222
223  // Set up the register classes.
224  addRegisterClass(MVT::i8, &X86::GR8RegClass);
225  addRegisterClass(MVT::i16, &X86::GR16RegClass);
226  addRegisterClass(MVT::i32, &X86::GR32RegClass);
227  if (Subtarget->is64Bit())
228    addRegisterClass(MVT::i64, &X86::GR64RegClass);
229
230  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
231
232  // We don't accept any truncstore of integer registers.
233  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
234  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
235  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
236  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
237  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
238  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
239
240  // SETOEQ and SETUNE require checking two conditions.
241  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
242  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
243  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
244  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
245  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
246  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
247
248  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
249  // operation.
250  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
251  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
252  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
253
254  if (Subtarget->is64Bit()) {
255    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
256    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
257  } else if (!TM.Options.UseSoftFloat) {
258    // We have an algorithm for SSE2->double, and we turn this into a
259    // 64-bit FILD followed by conditional FADD for other targets.
260    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
261    // We have an algorithm for SSE2, and we turn this into a 64-bit
262    // FILD for other targets.
263    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
264  }
265
266  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
267  // this operation.
268  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
269  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
270
271  if (!TM.Options.UseSoftFloat) {
272    // SSE has no i16 to fp conversion, only i32
273    if (X86ScalarSSEf32) {
274      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
275      // f32 and f64 cases are Legal, f80 case is not
276      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
277    } else {
278      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
279      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
280    }
281  } else {
282    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
283    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
284  }
285
286  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
287  // are Legal, f80 is custom lowered.
288  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
289  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
290
291  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
292  // this operation.
293  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
294  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
295
296  if (X86ScalarSSEf32) {
297    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
298    // f32 and f64 cases are Legal, f80 case is not
299    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
300  } else {
301    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
302    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
303  }
304
305  // Handle FP_TO_UINT by promoting the destination to a larger signed
306  // conversion.
307  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
308  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
309  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
310
311  if (Subtarget->is64Bit()) {
312    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
313    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
314  } else if (!TM.Options.UseSoftFloat) {
315    // Since AVX is a superset of SSE3, only check for SSE here.
316    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
317      // Expand FP_TO_UINT into a select.
318      // FIXME: We would like to use a Custom expander here eventually to do
319      // the optimal thing for SSE vs. the default expansion in the legalizer.
320      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
321    else
322      // With SSE3 we can use fisttpll to convert to a signed i64; without
323      // SSE, we're stuck with a fistpll.
324      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
325  }
326
327  if (isTargetFTOL()) {
328    // Use the _ftol2 runtime function, which has a pseudo-instruction
329    // to handle its weird calling convention.
330    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
331  }
332
333  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
334  if (!X86ScalarSSEf64) {
335    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
336    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
337    if (Subtarget->is64Bit()) {
338      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
339      // Without SSE, i64->f64 goes through memory.
340      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
341    }
342  }
343
344  // Scalar integer divide and remainder are lowered to use operations that
345  // produce two results, to match the available instructions. This exposes
346  // the two-result form to trivial CSE, which is able to combine x/y and x%y
347  // into a single instruction.
348  //
349  // Scalar integer multiply-high is also lowered to use two-result
350  // operations, to match the available instructions. However, plain multiply
351  // (low) operations are left as Legal, as there are single-result
352  // instructions for this in x86. Using the two-result multiply instructions
353  // when both high and low results are needed must be arranged by dagcombine.
354  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
355    MVT VT = IntVTs[i];
356    setOperationAction(ISD::MULHS, VT, Expand);
357    setOperationAction(ISD::MULHU, VT, Expand);
358    setOperationAction(ISD::SDIV, VT, Expand);
359    setOperationAction(ISD::UDIV, VT, Expand);
360    setOperationAction(ISD::SREM, VT, Expand);
361    setOperationAction(ISD::UREM, VT, Expand);
362
363    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
364    setOperationAction(ISD::ADDC, VT, Custom);
365    setOperationAction(ISD::ADDE, VT, Custom);
366    setOperationAction(ISD::SUBC, VT, Custom);
367    setOperationAction(ISD::SUBE, VT, Custom);
368  }
369
370  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
371  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
372  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
373  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
374  if (Subtarget->is64Bit())
375    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
376  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
377  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
378  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
379  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
380  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
381  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
382  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
383  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
384
385  // Promote the i8 variants and force them on up to i32 which has a shorter
386  // encoding.
387  setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
388  AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
389  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
390  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
391  if (Subtarget->hasBMI()) {
392    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
393    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
394    if (Subtarget->is64Bit())
395      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
396  } else {
397    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
398    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
399    if (Subtarget->is64Bit())
400      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
401  }
402
403  if (Subtarget->hasLZCNT()) {
404    // When promoting the i8 variants, force them to i32 for a shorter
405    // encoding.
406    setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
407    AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
408    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
409    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
410    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
411    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
412    if (Subtarget->is64Bit())
413      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
414  } else {
415    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
416    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
417    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
418    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
419    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
420    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
421    if (Subtarget->is64Bit()) {
422      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
423      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
424    }
425  }
426
427  if (Subtarget->hasPOPCNT()) {
428    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
429  } else {
430    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
431    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
432    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
433    if (Subtarget->is64Bit())
434      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
435  }
436
437  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
438  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
439
440  // These should be promoted to a larger select which is supported.
441  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
442  // X86 wants to expand cmov itself.
443  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
444  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
445  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
446  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
447  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
448  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
449  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
450  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
451  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
452  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
453  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
454  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
455  if (Subtarget->is64Bit()) {
456    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
457    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
458  }
459  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
460
461  // Darwin ABI issue.
462  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
463  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
464  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
465  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
466  if (Subtarget->is64Bit())
467    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
468  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
469  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
470  if (Subtarget->is64Bit()) {
471    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
472    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
473    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
474    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
475    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
476  }
477  // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
478  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
479  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
480  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
481  if (Subtarget->is64Bit()) {
482    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
483    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
484    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
485  }
486
487  if (Subtarget->hasSSE1())
488    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
489
490  setOperationAction(ISD::MEMBARRIER    , MVT::Other, Custom);
491  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
492
493  // On X86 and X86-64, atomic operations are lowered to locked instructions.
494  // Locked instructions, in turn, have implicit fence semantics (all memory
495  // operations are flushed before issuing the locked instruction, and they
496  // are not buffered), so we can fold away the common pattern of
497  // fence-atomic-fence.
498  setShouldFoldAtomicFences(true);
499
500  // Expand certain atomics
501  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
502    MVT VT = IntVTs[i];
503    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
504    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
505    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
506  }
507
508  if (!Subtarget->is64Bit()) {
509    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
510    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
511    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
512    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
513    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
514    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
515    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
516    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
517  }
518
519  if (Subtarget->hasCmpxchg16b()) {
520    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
521  }
522
523  // FIXME - use subtarget debug flags
524  if (!Subtarget->isTargetDarwin() &&
525      !Subtarget->isTargetELF() &&
526      !Subtarget->isTargetCygMing()) {
527    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
528  }
529
530  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
531  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
532  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
533  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
534  if (Subtarget->is64Bit()) {
535    setExceptionPointerRegister(X86::RAX);
536    setExceptionSelectorRegister(X86::RDX);
537  } else {
538    setExceptionPointerRegister(X86::EAX);
539    setExceptionSelectorRegister(X86::EDX);
540  }
541  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
542  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
543
544  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
545  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
546
547  setOperationAction(ISD::TRAP, MVT::Other, Legal);
548
549  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
550  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
551  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
552  if (Subtarget->is64Bit()) {
553    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
554    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
555  } else {
556    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
557    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
558  }
559
560  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
561  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
562
563  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
564    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
565                       MVT::i64 : MVT::i32, Custom);
566  else if (TM.Options.EnableSegmentedStacks)
567    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
568                       MVT::i64 : MVT::i32, Custom);
569  else
570    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
571                       MVT::i64 : MVT::i32, Expand);
572
573  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
574    // f32 and f64 use SSE.
575    // Set up the FP register classes.
576    addRegisterClass(MVT::f32, &X86::FR32RegClass);
577    addRegisterClass(MVT::f64, &X86::FR64RegClass);
578
579    // Use ANDPD to simulate FABS.
580    setOperationAction(ISD::FABS , MVT::f64, Custom);
581    setOperationAction(ISD::FABS , MVT::f32, Custom);
582
583    // Use XORP to simulate FNEG.
584    setOperationAction(ISD::FNEG , MVT::f64, Custom);
585    setOperationAction(ISD::FNEG , MVT::f32, Custom);
586
587    // Use ANDPD and ORPD to simulate FCOPYSIGN.
588    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
589    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
590
591    // Lower this to FGETSIGNx86 plus an AND.
592    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
593    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
594
595    // We don't support sin/cos/fmod
596    setOperationAction(ISD::FSIN , MVT::f64, Expand);
597    setOperationAction(ISD::FCOS , MVT::f64, Expand);
598    setOperationAction(ISD::FSIN , MVT::f32, Expand);
599    setOperationAction(ISD::FCOS , MVT::f32, Expand);
600
601    // Expand FP immediates into loads from the stack, except for the special
602    // cases we handle.
603    addLegalFPImmediate(APFloat(+0.0)); // xorpd
604    addLegalFPImmediate(APFloat(+0.0f)); // xorps
605  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
606    // Use SSE for f32, x87 for f64.
607    // Set up the FP register classes.
608    addRegisterClass(MVT::f32, &X86::FR32RegClass);
609    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
610
611    // Use ANDPS to simulate FABS.
612    setOperationAction(ISD::FABS , MVT::f32, Custom);
613
614    // Use XORP to simulate FNEG.
615    setOperationAction(ISD::FNEG , MVT::f32, Custom);
616
617    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
618
619    // Use ANDPS and ORPS to simulate FCOPYSIGN.
620    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
621    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
622
623    // We don't support sin/cos/fmod
624    setOperationAction(ISD::FSIN , MVT::f32, Expand);
625    setOperationAction(ISD::FCOS , MVT::f32, Expand);
626
627    // Special cases we handle for FP constants.
628    addLegalFPImmediate(APFloat(+0.0f)); // xorps
629    addLegalFPImmediate(APFloat(+0.0)); // FLD0
630    addLegalFPImmediate(APFloat(+1.0)); // FLD1
631    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
632    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
633
634    if (!TM.Options.UnsafeFPMath) {
635      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
636      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
637    }
638  } else if (!TM.Options.UseSoftFloat) {
639    // f32 and f64 in x87.
640    // Set up the FP register classes.
641    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
642    addRegisterClass(MVT::f32, &X86::RFP32RegClass);
643
644    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
645    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
646    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
647    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
648
649    if (!TM.Options.UnsafeFPMath) {
650      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
651      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
652    }
653    addLegalFPImmediate(APFloat(+0.0)); // FLD0
654    addLegalFPImmediate(APFloat(+1.0)); // FLD1
655    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
656    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
657    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
658    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
659    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
660    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
661  }
662
663  // We don't support FMA.
664  setOperationAction(ISD::FMA, MVT::f64, Expand);
665  setOperationAction(ISD::FMA, MVT::f32, Expand);
666
667  // Long double always uses X87.
668  if (!TM.Options.UseSoftFloat) {
669    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
670    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
671    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
672    {
673      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
674      addLegalFPImmediate(TmpFlt);  // FLD0
675      TmpFlt.changeSign();
676      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
677
678      bool ignored;
679      APFloat TmpFlt2(+1.0);
680      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
681                      &ignored);
682      addLegalFPImmediate(TmpFlt2);  // FLD1
683      TmpFlt2.changeSign();
684      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
685    }
686
687    if (!TM.Options.UnsafeFPMath) {
688      setOperationAction(ISD::FSIN           , MVT::f80  , Expand);
689      setOperationAction(ISD::FCOS           , MVT::f80  , Expand);
690    }
691
692    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
693    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
694    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
695    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
696    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
697    setOperationAction(ISD::FMA, MVT::f80, Expand);
698  }
699
700  // Always use a library call for pow.
701  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
702  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
703  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
704
705  setOperationAction(ISD::FLOG, MVT::f80, Expand);
706  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
707  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
708  setOperationAction(ISD::FEXP, MVT::f80, Expand);
709  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
710
711  // First set operation action for all vector types to either promote
712  // (for widening) or expand (for scalarization). Then we will selectively
713  // turn on ones that can be effectively codegen'd.
714  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
715           VT <= MVT::LAST_VECTOR_VALUETYPE; ++VT) {
716    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
717    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
718    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
719    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
720    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
721    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
722    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
723    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
724    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
725    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
726    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
727    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
728    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
729    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
730    setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
731    setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
732    setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
733    setOperationAction(ISD::INSERT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
734    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
735    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
736    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
737    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
738    setOperationAction(ISD::FMA,  (MVT::SimpleValueType)VT, Expand);
739    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
740    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
741    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
742    setOperationAction(ISD::FFLOOR, (MVT::SimpleValueType)VT, Expand);
743    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
744    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
745    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
746    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
747    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
748    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
749    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
750    setOperationAction(ISD::CTTZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
751    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
752    setOperationAction(ISD::CTLZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
753    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
754    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
755    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
756    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
757    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
758    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
759    setOperationAction(ISD::SETCC, (MVT::SimpleValueType)VT, Expand);
760    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
761    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
762    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
763    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
764    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
765    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
766    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
767    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
768    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
769    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand);
770    setOperationAction(ISD::TRUNCATE,  (MVT::SimpleValueType)VT, Expand);
771    setOperationAction(ISD::SIGN_EXTEND,  (MVT::SimpleValueType)VT, Expand);
772    setOperationAction(ISD::ZERO_EXTEND,  (MVT::SimpleValueType)VT, Expand);
773    setOperationAction(ISD::ANY_EXTEND,  (MVT::SimpleValueType)VT, Expand);
774    setOperationAction(ISD::VSELECT,  (MVT::SimpleValueType)VT, Expand);
775    for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
776             InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
777      setTruncStoreAction((MVT::SimpleValueType)VT,
778                          (MVT::SimpleValueType)InnerVT, Expand);
779    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
780    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
781    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
782  }
783
784  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
785  // with -msoft-float, disable use of MMX as well.
786  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
787    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
788    // No operations on x86mmx supported, everything uses intrinsics.
789  }
790
791  // MMX-sized vectors (other than x86mmx) are expected to be expanded
792  // into smaller operations.
793  setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
794  setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
795  setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
796  setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
797  setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
798  setOperationAction(ISD::AND,                MVT::v4i16, Expand);
799  setOperationAction(ISD::AND,                MVT::v2i32, Expand);
800  setOperationAction(ISD::AND,                MVT::v1i64, Expand);
801  setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
802  setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
803  setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
804  setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
805  setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
806  setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
807  setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
808  setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
809  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
810  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
811  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
812  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
813  setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
814  setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
815  setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
816  setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
817  setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
818  setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
819  setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
820  setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
821  setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
822
823  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
824    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
825
826    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
827    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
828    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
829    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
830    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
831    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
832    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
833    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
834    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
835    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
836    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
837  }
838
839  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
840    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
841
842    // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
843    // registers cannot be used even for integer operations.
844    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
845    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
846    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
847    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
848
849    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
850    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
851    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
852    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
853    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
854    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
855    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
856    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
857    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
858    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
859    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
860    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
861    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
862    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
863    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
864    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
865
866    setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
867    setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
868    setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
869    setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
870
871    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
872    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
873    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
874    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
875    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
876
877    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
878    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
879      MVT VT = (MVT::SimpleValueType)i;
880      // Do not attempt to custom lower non-power-of-2 vectors
881      if (!isPowerOf2_32(VT.getVectorNumElements()))
882        continue;
883      // Do not attempt to custom lower non-128-bit vectors
884      if (!VT.is128BitVector())
885        continue;
886      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
887      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
888      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
889    }
890
891    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
892    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
893    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
894    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
895    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
896    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
897
898    if (Subtarget->is64Bit()) {
899      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
900      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
901    }
902
903    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
904    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
905      MVT VT = (MVT::SimpleValueType)i;
906
907      // Do not attempt to promote non-128-bit vectors
908      if (!VT.is128BitVector())
909        continue;
910
911      setOperationAction(ISD::AND,    VT, Promote);
912      AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
913      setOperationAction(ISD::OR,     VT, Promote);
914      AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
915      setOperationAction(ISD::XOR,    VT, Promote);
916      AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
917      setOperationAction(ISD::LOAD,   VT, Promote);
918      AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
919      setOperationAction(ISD::SELECT, VT, Promote);
920      AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
921    }
922
923    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
924
925    // Custom lower v2i64 and v2f64 selects.
926    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
927    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
928    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
929    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
930
931    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
932    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
933  }
934
935  if (Subtarget->hasSSE41()) {
936    setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
937    setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
938    setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
939    setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
940    setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
941    setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
942    setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
943    setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
944    setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
945    setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
946
947    // FIXME: Do we need to handle scalar-to-vector here?
948    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
949
950    setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
951    setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
952    setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
953    setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
954    setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);
955
956    // i8 and i16 vectors are custom , because the source register and source
957    // source memory operand types are not the same width.  f32 vectors are
958    // custom since the immediate controlling the insert encodes additional
959    // information.
960    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
961    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
962    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
963    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
964
965    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
966    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
967    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
968    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
969
970    // FIXME: these should be Legal but thats only for the case where
971    // the index is constant.  For now custom expand to deal with that.
972    if (Subtarget->is64Bit()) {
973      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
974      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
975    }
976  }
977
978  if (Subtarget->hasSSE2()) {
979    setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
980    setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
981
982    setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
983    setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
984
985    setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
986    setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
987
988    if (Subtarget->hasAVX2()) {
989      setOperationAction(ISD::SRL,             MVT::v2i64, Legal);
990      setOperationAction(ISD::SRL,             MVT::v4i32, Legal);
991
992      setOperationAction(ISD::SHL,             MVT::v2i64, Legal);
993      setOperationAction(ISD::SHL,             MVT::v4i32, Legal);
994
995      setOperationAction(ISD::SRA,             MVT::v4i32, Legal);
996    } else {
997      setOperationAction(ISD::SRL,             MVT::v2i64, Custom);
998      setOperationAction(ISD::SRL,             MVT::v4i32, Custom);
999
1000      setOperationAction(ISD::SHL,             MVT::v2i64, Custom);
1001      setOperationAction(ISD::SHL,             MVT::v4i32, Custom);
1002
1003      setOperationAction(ISD::SRA,             MVT::v4i32, Custom);
1004    }
1005  }
1006
1007  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) {
1008    addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
1009    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1010    addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
1011    addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
1012    addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
1013    addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
1014
1015    setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
1016    setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
1017    setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
1018
1019    setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
1020    setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
1021    setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
1022    setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
1023    setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
1024    setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
1025
1026    setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
1027    setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
1028    setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
1029    setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
1030    setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
1031    setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
1032
1033    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1034    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1035    setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1036
1037    setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
1038    setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
1039
1040    setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
1041    setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
1042
1043    setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
1044    setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
1045
1046    setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
1047    setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
1048    setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
1049    setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
1050
1051    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1052    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1053    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1054
1055    setOperationAction(ISD::VSELECT,           MVT::v4f64, Legal);
1056    setOperationAction(ISD::VSELECT,           MVT::v4i64, Legal);
1057    setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
1058    setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
1059
1060    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1061      setOperationAction(ISD::FMA,             MVT::v8f32, Custom);
1062      setOperationAction(ISD::FMA,             MVT::v4f64, Custom);
1063      setOperationAction(ISD::FMA,             MVT::v4f32, Custom);
1064      setOperationAction(ISD::FMA,             MVT::v2f64, Custom);
1065      setOperationAction(ISD::FMA,             MVT::f32, Custom);
1066      setOperationAction(ISD::FMA,             MVT::f64, Custom);
1067    }
1068
1069    if (Subtarget->hasAVX2()) {
1070      setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
1071      setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
1072      setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
1073      setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
1074
1075      setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
1076      setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
1077      setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
1078      setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
1079
1080      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1081      setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
1082      setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
1083      // Don't lower v32i8 because there is no 128-bit byte mul
1084
1085      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1086
1087      setOperationAction(ISD::SRL,             MVT::v4i64, Legal);
1088      setOperationAction(ISD::SRL,             MVT::v8i32, Legal);
1089
1090      setOperationAction(ISD::SHL,             MVT::v4i64, Legal);
1091      setOperationAction(ISD::SHL,             MVT::v8i32, Legal);
1092
1093      setOperationAction(ISD::SRA,             MVT::v8i32, Legal);
1094    } else {
1095      setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
1096      setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
1097      setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
1098      setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
1099
1100      setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
1101      setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
1102      setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
1103      setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
1104
1105      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1106      setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
1107      setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
1108      // Don't lower v32i8 because there is no 128-bit byte mul
1109
1110      setOperationAction(ISD::SRL,             MVT::v4i64, Custom);
1111      setOperationAction(ISD::SRL,             MVT::v8i32, Custom);
1112
1113      setOperationAction(ISD::SHL,             MVT::v4i64, Custom);
1114      setOperationAction(ISD::SHL,             MVT::v8i32, Custom);
1115
1116      setOperationAction(ISD::SRA,             MVT::v8i32, Custom);
1117    }
1118
1119    // Custom lower several nodes for 256-bit types.
1120    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
1121             i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
1122      MVT VT = (MVT::SimpleValueType)i;
1123
1124      // Extract subvector is special because the value type
1125      // (result) is 128-bit but the source is 256-bit wide.
1126      if (VT.is128BitVector())
1127        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1128
1129      // Do not attempt to custom lower other non-256-bit vectors
1130      if (!VT.is256BitVector())
1131        continue;
1132
1133      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1134      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1135      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1136      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1137      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1138      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1139      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1140    }
1141
1142    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1143    for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1144      MVT VT = (MVT::SimpleValueType)i;
1145
1146      // Do not attempt to promote non-256-bit vectors
1147      if (!VT.is256BitVector())
1148        continue;
1149
1150      setOperationAction(ISD::AND,    VT, Promote);
1151      AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
1152      setOperationAction(ISD::OR,     VT, Promote);
1153      AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
1154      setOperationAction(ISD::XOR,    VT, Promote);
1155      AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
1156      setOperationAction(ISD::LOAD,   VT, Promote);
1157      AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
1158      setOperationAction(ISD::SELECT, VT, Promote);
1159      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1160    }
1161  }
1162
1163  // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1164  // of this type with custom code.
1165  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
1166           VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
1167    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
1168                       Custom);
1169  }
1170
1171  // We want to custom lower some of our intrinsics.
1172  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1173  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1174
1175
1176  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1177  // handle type legalization for these operations here.
1178  //
1179  // FIXME: We really should do custom legalization for addition and
1180  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1181  // than generic legalization for 64-bit multiplication-with-overflow, though.
1182  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1183    // Add/Sub/Mul with overflow operations are custom lowered.
1184    MVT VT = IntVTs[i];
1185    setOperationAction(ISD::SADDO, VT, Custom);
1186    setOperationAction(ISD::UADDO, VT, Custom);
1187    setOperationAction(ISD::SSUBO, VT, Custom);
1188    setOperationAction(ISD::USUBO, VT, Custom);
1189    setOperationAction(ISD::SMULO, VT, Custom);
1190    setOperationAction(ISD::UMULO, VT, Custom);
1191  }
1192
1193  // There are no 8-bit 3-address imul/mul instructions
1194  setOperationAction(ISD::SMULO, MVT::i8, Expand);
1195  setOperationAction(ISD::UMULO, MVT::i8, Expand);
1196
1197  if (!Subtarget->is64Bit()) {
1198    // These libcalls are not available in 32-bit.
1199    setLibcallName(RTLIB::SHL_I128, 0);
1200    setLibcallName(RTLIB::SRL_I128, 0);
1201    setLibcallName(RTLIB::SRA_I128, 0);
1202  }
1203
1204  // We have target-specific dag combine patterns for the following nodes:
1205  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1206  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1207  setTargetDAGCombine(ISD::VSELECT);
1208  setTargetDAGCombine(ISD::SELECT);
1209  setTargetDAGCombine(ISD::SHL);
1210  setTargetDAGCombine(ISD::SRA);
1211  setTargetDAGCombine(ISD::SRL);
1212  setTargetDAGCombine(ISD::OR);
1213  setTargetDAGCombine(ISD::AND);
1214  setTargetDAGCombine(ISD::ADD);
1215  setTargetDAGCombine(ISD::FADD);
1216  setTargetDAGCombine(ISD::FSUB);
1217  setTargetDAGCombine(ISD::FMA);
1218  setTargetDAGCombine(ISD::SUB);
1219  setTargetDAGCombine(ISD::LOAD);
1220  setTargetDAGCombine(ISD::STORE);
1221  setTargetDAGCombine(ISD::ZERO_EXTEND);
1222  setTargetDAGCombine(ISD::ANY_EXTEND);
1223  setTargetDAGCombine(ISD::SIGN_EXTEND);
1224  setTargetDAGCombine(ISD::TRUNCATE);
1225  setTargetDAGCombine(ISD::UINT_TO_FP);
1226  setTargetDAGCombine(ISD::SINT_TO_FP);
1227  setTargetDAGCombine(ISD::SETCC);
1228  setTargetDAGCombine(ISD::FP_TO_SINT);
1229  if (Subtarget->is64Bit())
1230    setTargetDAGCombine(ISD::MUL);
1231  setTargetDAGCombine(ISD::XOR);
1232
1233  computeRegisterProperties();
1234
1235  // On Darwin, -Os means optimize for size without hurting performance,
1236  // do not reduce the limit.
1237  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1238  maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1239  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1240  maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1241  maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1242  maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1243  setPrefLoopAlignment(4); // 2^4 bytes.
1244  benefitFromCodePlacementOpt = true;
1245
1246  // Predictable cmov don't hurt on atom because it's in-order.
1247  predictableSelectIsExpensive = !Subtarget->isAtom();
1248
1249  setPrefFunctionAlignment(4); // 2^4 bytes.
1250}
1251
1252
1253EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
1254  if (!VT.isVector()) return MVT::i8;
1255  return VT.changeVectorElementTypeToInteger();
1256}
1257
1258
1259/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1260/// the desired ByVal argument alignment.
1261static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1262  if (MaxAlign == 16)
1263    return;
1264  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1265    if (VTy->getBitWidth() == 128)
1266      MaxAlign = 16;
1267  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1268    unsigned EltAlign = 0;
1269    getMaxByValAlign(ATy->getElementType(), EltAlign);
1270    if (EltAlign > MaxAlign)
1271      MaxAlign = EltAlign;
1272  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1273    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1274      unsigned EltAlign = 0;
1275      getMaxByValAlign(STy->getElementType(i), EltAlign);
1276      if (EltAlign > MaxAlign)
1277        MaxAlign = EltAlign;
1278      if (MaxAlign == 16)
1279        break;
1280    }
1281  }
1282}
1283
1284/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1285/// function arguments in the caller parameter area. For X86, aggregates
1286/// that contain SSE vectors are placed at 16-byte boundaries while the rest
1287/// are at 4-byte boundaries.
1288unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1289  if (Subtarget->is64Bit()) {
1290    // Max of 8 and alignment of type.
1291    unsigned TyAlign = TD->getABITypeAlignment(Ty);
1292    if (TyAlign > 8)
1293      return TyAlign;
1294    return 8;
1295  }
1296
1297  unsigned Align = 4;
1298  if (Subtarget->hasSSE1())
1299    getMaxByValAlign(Ty, Align);
1300  return Align;
1301}
1302
1303/// getOptimalMemOpType - Returns the target specific optimal type for load
1304/// and store operations as a result of memset, memcpy, and memmove
1305/// lowering. If DstAlign is zero that means it's safe to destination
1306/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1307/// means there isn't a need to check it against alignment requirement,
1308/// probably because the source does not need to be loaded. If
1309/// 'IsZeroVal' is true, that means it's safe to return a
1310/// non-scalar-integer type, e.g. empty string source, constant, or loaded
1311/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
1312/// constant so it does not need to be loaded.
1313/// It returns EVT::Other if the type should be determined using generic
1314/// target-independent logic.
1315EVT
1316X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1317                                       unsigned DstAlign, unsigned SrcAlign,
1318                                       bool IsZeroVal,
1319                                       bool MemcpyStrSrc,
1320                                       MachineFunction &MF) const {
1321  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
1322  // linux.  This is because the stack realignment code can't handle certain
1323  // cases like PR2962.  This should be removed when PR2962 is fixed.
1324  const Function *F = MF.getFunction();
1325  if (IsZeroVal &&
1326      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
1327    if (Size >= 16 &&
1328        (Subtarget->isUnalignedMemAccessFast() ||
1329         ((DstAlign == 0 || DstAlign >= 16) &&
1330          (SrcAlign == 0 || SrcAlign >= 16))) &&
1331        Subtarget->getStackAlignment() >= 16) {
1332      if (Subtarget->getStackAlignment() >= 32) {
1333        if (Subtarget->hasAVX2())
1334          return MVT::v8i32;
1335        if (Subtarget->hasAVX())
1336          return MVT::v8f32;
1337      }
1338      if (Subtarget->hasSSE2())
1339        return MVT::v4i32;
1340      if (Subtarget->hasSSE1())
1341        return MVT::v4f32;
1342    } else if (!MemcpyStrSrc && Size >= 8 &&
1343               !Subtarget->is64Bit() &&
1344               Subtarget->getStackAlignment() >= 8 &&
1345               Subtarget->hasSSE2()) {
1346      // Do not use f64 to lower memcpy if source is string constant. It's
1347      // better to use i32 to avoid the loads.
1348      return MVT::f64;
1349    }
1350  }
1351  if (Subtarget->is64Bit() && Size >= 8)
1352    return MVT::i64;
1353  return MVT::i32;
1354}
1355
1356/// getJumpTableEncoding - Return the entry encoding for a jump table in the
1357/// current function.  The returned value is a member of the
1358/// MachineJumpTableInfo::JTEntryKind enum.
1359unsigned X86TargetLowering::getJumpTableEncoding() const {
1360  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1361  // symbol.
1362  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1363      Subtarget->isPICStyleGOT())
1364    return MachineJumpTableInfo::EK_Custom32;
1365
1366  // Otherwise, use the normal jump table encoding heuristics.
1367  return TargetLowering::getJumpTableEncoding();
1368}
1369
1370const MCExpr *
1371X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1372                                             const MachineBasicBlock *MBB,
1373                                             unsigned uid,MCContext &Ctx) const{
1374  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1375         Subtarget->isPICStyleGOT());
1376  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1377  // entries.
1378  return MCSymbolRefExpr::Create(MBB->getSymbol(),
1379                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1380}
1381
1382/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
1383/// jumptable.
1384SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1385                                                    SelectionDAG &DAG) const {
1386  if (!Subtarget->is64Bit())
1387    // This doesn't have DebugLoc associated with it, but is not really the
1388    // same as a Register.
1389    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
1390  return Table;
1391}
1392
1393/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
1394/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
1395/// MCExpr.
1396const MCExpr *X86TargetLowering::
1397getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1398                             MCContext &Ctx) const {
1399  // X86-64 uses RIP relative addressing based on the jump table label.
1400  if (Subtarget->isPICStyleRIPRel())
1401    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1402
1403  // Otherwise, the reference is relative to the PIC base.
1404  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1405}
1406
1407// FIXME: Why this routine is here? Move to RegInfo!
1408std::pair<const TargetRegisterClass*, uint8_t>
1409X86TargetLowering::findRepresentativeClass(EVT VT) const{
1410  const TargetRegisterClass *RRC = 0;
1411  uint8_t Cost = 1;
1412  switch (VT.getSimpleVT().SimpleTy) {
1413  default:
1414    return TargetLowering::findRepresentativeClass(VT);
1415  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1416    RRC = Subtarget->is64Bit() ?
1417      (const TargetRegisterClass*)&X86::GR64RegClass :
1418      (const TargetRegisterClass*)&X86::GR32RegClass;
1419    break;
1420  case MVT::x86mmx:
1421    RRC = &X86::VR64RegClass;
1422    break;
1423  case MVT::f32: case MVT::f64:
1424  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1425  case MVT::v4f32: case MVT::v2f64:
1426  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1427  case MVT::v4f64:
1428    RRC = &X86::VR128RegClass;
1429    break;
1430  }
1431  return std::make_pair(RRC, Cost);
1432}
1433
1434bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1435                                               unsigned &Offset) const {
1436  if (!Subtarget->isTargetLinux())
1437    return false;
1438
1439  if (Subtarget->is64Bit()) {
1440    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1441    Offset = 0x28;
1442    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1443      AddressSpace = 256;
1444    else
1445      AddressSpace = 257;
1446  } else {
1447    // %gs:0x14 on i386
1448    Offset = 0x14;
1449    AddressSpace = 256;
1450  }
1451  return true;
1452}
1453
1454
1455//===----------------------------------------------------------------------===//
1456//               Return Value Calling Convention Implementation
1457//===----------------------------------------------------------------------===//
1458
1459#include "X86GenCallingConv.inc"
1460
1461bool
1462X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
1463                                  MachineFunction &MF, bool isVarArg,
1464                        const SmallVectorImpl<ISD::OutputArg> &Outs,
1465                        LLVMContext &Context) const {
1466  SmallVector<CCValAssign, 16> RVLocs;
1467  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1468                 RVLocs, Context);
1469  return CCInfo.CheckReturn(Outs, RetCC_X86);
1470}
1471
1472SDValue
1473X86TargetLowering::LowerReturn(SDValue Chain,
1474                               CallingConv::ID CallConv, bool isVarArg,
1475                               const SmallVectorImpl<ISD::OutputArg> &Outs,
1476                               const SmallVectorImpl<SDValue> &OutVals,
1477                               DebugLoc dl, SelectionDAG &DAG) const {
1478  MachineFunction &MF = DAG.getMachineFunction();
1479  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1480
1481  SmallVector<CCValAssign, 16> RVLocs;
1482  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1483                 RVLocs, *DAG.getContext());
1484  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1485
1486  // Add the regs to the liveout set for the function.
1487  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1488  for (unsigned i = 0; i != RVLocs.size(); ++i)
1489    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
1490      MRI.addLiveOut(RVLocs[i].getLocReg());
1491
1492  SDValue Flag;
1493
1494  SmallVector<SDValue, 6> RetOps;
1495  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1496  // Operand #1 = Bytes To Pop
1497  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
1498                   MVT::i16));
1499
1500  // Copy the result values into the output registers.
1501  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1502    CCValAssign &VA = RVLocs[i];
1503    assert(VA.isRegLoc() && "Can only return in registers!");
1504    SDValue ValToCopy = OutVals[i];
1505    EVT ValVT = ValToCopy.getValueType();
1506
1507    // Promote values to the appropriate types
1508    if (VA.getLocInfo() == CCValAssign::SExt)
1509      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
1510    else if (VA.getLocInfo() == CCValAssign::ZExt)
1511      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
1512    else if (VA.getLocInfo() == CCValAssign::AExt)
1513      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
1514    else if (VA.getLocInfo() == CCValAssign::BCvt)
1515      ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
1516
1517    // If this is x86-64, and we disabled SSE, we can't return FP values,
1518    // or SSE or MMX vectors.
1519    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
1520         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
1521          (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
1522      report_fatal_error("SSE register return with SSE disabled");
1523    }
1524    // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
1525    // llvm-gcc has never done it right and no one has noticed, so this
1526    // should be OK for now.
1527    if (ValVT == MVT::f64 &&
1528        (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
1529      report_fatal_error("SSE2 register return with SSE2 disabled");
1530
1531    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
1532    // the RET instruction and handled by the FP Stackifier.
1533    if (VA.getLocReg() == X86::ST0 ||
1534        VA.getLocReg() == X86::ST1) {
1535      // If this is a copy from an xmm register to ST(0), use an FPExtend to
1536      // change the value to the FP stack register class.
1537      if (isScalarFPTypeInSSEReg(VA.getValVT()))
1538        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
1539      RetOps.push_back(ValToCopy);
1540      // Don't emit a copytoreg.
1541      continue;
1542    }
1543
1544    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
1545    // which is returned in RAX / RDX.
1546    if (Subtarget->is64Bit()) {
1547      if (ValVT == MVT::x86mmx) {
1548        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1549          ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
1550          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
1551                                  ValToCopy);
1552          // If we don't have SSE2 available, convert to v4f32 so the generated
1553          // register is legal.
1554          if (!Subtarget->hasSSE2())
1555            ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
1556        }
1557      }
1558    }
1559
1560    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
1561    Flag = Chain.getValue(1);
1562  }
1563
1564  // The x86-64 ABI for returning structs by value requires that we copy
1565  // the sret argument into %rax for the return. We saved the argument into
1566  // a virtual register in the entry block, so now we copy the value out
1567  // and into %rax.
1568  if (Subtarget->is64Bit() &&
1569      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
1570    MachineFunction &MF = DAG.getMachineFunction();
1571    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1572    unsigned Reg = FuncInfo->getSRetReturnReg();
1573    assert(Reg &&
1574           "SRetReturnReg should have been set in LowerFormalArguments().");
1575    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
1576
1577    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
1578    Flag = Chain.getValue(1);
1579
1580    // RAX now acts like a return value.
1581    MRI.addLiveOut(X86::RAX);
1582  }
1583
1584  RetOps[0] = Chain;  // Update chain.
1585
1586  // Add the flag if we have it.
1587  if (Flag.getNode())
1588    RetOps.push_back(Flag);
1589
1590  return DAG.getNode(X86ISD::RET_FLAG, dl,
1591                     MVT::Other, &RetOps[0], RetOps.size());
1592}
1593
1594bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
1595  if (N->getNumValues() != 1)
1596    return false;
1597  if (!N->hasNUsesOfValue(1, 0))
1598    return false;
1599
1600  SDValue TCChain = Chain;
1601  SDNode *Copy = *N->use_begin();
1602  if (Copy->getOpcode() == ISD::CopyToReg) {
1603    // If the copy has a glue operand, we conservatively assume it isn't safe to
1604    // perform a tail call.
1605    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
1606      return false;
1607    TCChain = Copy->getOperand(0);
1608  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
1609    return false;
1610
1611  bool HasRet = false;
1612  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
1613       UI != UE; ++UI) {
1614    if (UI->getOpcode() != X86ISD::RET_FLAG)
1615      return false;
1616    HasRet = true;
1617  }
1618
1619  if (!HasRet)
1620    return false;
1621
1622  Chain = TCChain;
1623  return true;
1624}
1625
1626EVT
1627X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
1628                                            ISD::NodeType ExtendKind) const {
1629  MVT ReturnMVT;
1630  // TODO: Is this also valid on 32-bit?
1631  if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
1632    ReturnMVT = MVT::i8;
1633  else
1634    ReturnMVT = MVT::i32;
1635
1636  EVT MinVT = getRegisterType(Context, ReturnMVT);
1637  return VT.bitsLT(MinVT) ? MinVT : VT;
1638}
1639
1640/// LowerCallResult - Lower the result values of a call into the
1641/// appropriate copies out of appropriate physical registers.
1642///
1643SDValue
1644X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1645                                   CallingConv::ID CallConv, bool isVarArg,
1646                                   const SmallVectorImpl<ISD::InputArg> &Ins,
1647                                   DebugLoc dl, SelectionDAG &DAG,
1648                                   SmallVectorImpl<SDValue> &InVals) const {
1649
1650  // Assign locations to each value returned by this call.
1651  SmallVector<CCValAssign, 16> RVLocs;
1652  bool Is64Bit = Subtarget->is64Bit();
1653  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1654                 getTargetMachine(), RVLocs, *DAG.getContext());
1655  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1656
1657  // Copy all of the result registers out of their specified physreg.
1658  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1659    CCValAssign &VA = RVLocs[i];
1660    EVT CopyVT = VA.getValVT();
1661
1662    // If this is x86-64, and we disabled SSE, we can't return FP values
1663    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1664        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
1665      report_fatal_error("SSE register return with SSE disabled");
1666    }
1667
1668    SDValue Val;
1669
1670    // If this is a call to a function that returns an fp value on the floating
1671    // point stack, we must guarantee the value is popped from the stack, so
1672    // a CopyFromReg is not good enough - the copy instruction may be eliminated
1673    // if the return value is not used. We use the FpPOP_RETVAL instruction
1674    // instead.
1675    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
1676      // If we prefer to use the value in xmm registers, copy it out as f80 and
1677      // use a truncate to move it from fp stack reg to xmm reg.
1678      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
1679      SDValue Ops[] = { Chain, InFlag };
1680      Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
1681                                         MVT::Other, MVT::Glue, Ops, 2), 1);
1682      Val = Chain.getValue(0);
1683
1684      // Round the f80 to the right size, which also moves it to the appropriate
1685      // xmm register.
1686      if (CopyVT != VA.getValVT())
1687        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1688                          // This truncation won't change the value.
1689                          DAG.getIntPtrConstant(1));
1690    } else {
1691      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1692                                 CopyVT, InFlag).getValue(1);
1693      Val = Chain.getValue(0);
1694    }
1695    InFlag = Chain.getValue(2);
1696    InVals.push_back(Val);
1697  }
1698
1699  return Chain;
1700}
1701
1702
1703//===----------------------------------------------------------------------===//
1704//                C & StdCall & Fast Calling Convention implementation
1705//===----------------------------------------------------------------------===//
1706//  StdCall calling convention seems to be standard for many Windows' API
1707//  routines and around. It differs from C calling convention just a little:
1708//  callee should clean up the stack, not caller. Symbols should be also
1709//  decorated in some fancy way :) It doesn't support any vector arguments.
1710//  For info on fast calling convention see Fast Calling Convention (tail call)
1711//  implementation LowerX86_32FastCCCallTo.
1712
1713/// CallIsStructReturn - Determines whether a call uses struct return
1714/// semantics.
1715enum StructReturnType {
1716  NotStructReturn,
1717  RegStructReturn,
1718  StackStructReturn
1719};
1720static StructReturnType
1721callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1722  if (Outs.empty())
1723    return NotStructReturn;
1724
1725  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
1726  if (!Flags.isSRet())
1727    return NotStructReturn;
1728  if (Flags.isInReg())
1729    return RegStructReturn;
1730  return StackStructReturn;
1731}
1732
1733/// ArgsAreStructReturn - Determines whether a function uses struct
1734/// return semantics.
1735static StructReturnType
1736argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1737  if (Ins.empty())
1738    return NotStructReturn;
1739
1740  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
1741  if (!Flags.isSRet())
1742    return NotStructReturn;
1743  if (Flags.isInReg())
1744    return RegStructReturn;
1745  return StackStructReturn;
1746}
1747
1748/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
1749/// by "Src" to address "Dst" with size and alignment information specified by
1750/// the specific parameter attribute. The copy will be passed as a byval
1751/// function parameter.
1752static SDValue
1753CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1754                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1755                          DebugLoc dl) {
1756  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1757
1758  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1759                       /*isVolatile*/false, /*AlwaysInline=*/true,
1760                       MachinePointerInfo(), MachinePointerInfo());
1761}
1762
1763/// IsTailCallConvention - Return true if the calling convention is one that
1764/// supports tail call optimization.
1765static bool IsTailCallConvention(CallingConv::ID CC) {
1766  return (CC == CallingConv::Fast || CC == CallingConv::GHC);
1767}
1768
1769bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
1770  if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
1771    return false;
1772
1773  CallSite CS(CI);
1774  CallingConv::ID CalleeCC = CS.getCallingConv();
1775  if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
1776    return false;
1777
1778  return true;
1779}
1780
1781/// FuncIsMadeTailCallSafe - Return true if the function is being made into
1782/// a tailcall target by changing its ABI.
1783static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
1784                                   bool GuaranteedTailCallOpt) {
1785  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
1786}
1787
1788SDValue
1789X86TargetLowering::LowerMemArgument(SDValue Chain,
1790                                    CallingConv::ID CallConv,
1791                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1792                                    DebugLoc dl, SelectionDAG &DAG,
1793                                    const CCValAssign &VA,
1794                                    MachineFrameInfo *MFI,
1795                                    unsigned i) const {
1796  // Create the nodes corresponding to a load from this parameter slot.
1797  ISD::ArgFlagsTy Flags = Ins[i].Flags;
1798  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
1799                              getTargetMachine().Options.GuaranteedTailCallOpt);
1800  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1801  EVT ValVT;
1802
1803  // If value is passed by pointer we have address passed instead of the value
1804  // itself.
1805  if (VA.getLocInfo() == CCValAssign::Indirect)
1806    ValVT = VA.getLocVT();
1807  else
1808    ValVT = VA.getValVT();
1809
1810  // FIXME: For now, all byval parameter objects are marked mutable. This can be
1811  // changed with more analysis.
1812  // In case of tail call optimization mark all arguments mutable. Since they
1813  // could be overwritten by lowering of arguments in case of a tail call.
1814  if (Flags.isByVal()) {
1815    unsigned Bytes = Flags.getByValSize();
1816    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
1817    int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
1818    return DAG.getFrameIndex(FI, getPointerTy());
1819  } else {
1820    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
1821                                    VA.getLocMemOffset(), isImmutable);
1822    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1823    return DAG.getLoad(ValVT, dl, Chain, FIN,
1824                       MachinePointerInfo::getFixedStack(FI),
1825                       false, false, false, 0);
1826  }
1827}
1828
1829SDValue
1830X86TargetLowering::LowerFormalArguments(SDValue Chain,
1831                                        CallingConv::ID CallConv,
1832                                        bool isVarArg,
1833                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1834                                        DebugLoc dl,
1835                                        SelectionDAG &DAG,
1836                                        SmallVectorImpl<SDValue> &InVals)
1837                                          const {
1838  MachineFunction &MF = DAG.getMachineFunction();
1839  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1840
1841  const Function* Fn = MF.getFunction();
1842  if (Fn->hasExternalLinkage() &&
1843      Subtarget->isTargetCygMing() &&
1844      Fn->getName() == "main")
1845    FuncInfo->setForceFramePointer(true);
1846
1847  MachineFrameInfo *MFI = MF.getFrameInfo();
1848  bool Is64Bit = Subtarget->is64Bit();
1849  bool IsWindows = Subtarget->isTargetWindows();
1850  bool IsWin64 = Subtarget->isTargetWin64();
1851
1852  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1853         "Var args not supported with calling convention fastcc or ghc");
1854
1855  // Assign locations to all of the incoming arguments.
1856  SmallVector<CCValAssign, 16> ArgLocs;
1857  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
1858                 ArgLocs, *DAG.getContext());
1859
1860  // Allocate shadow area for Win64
1861  if (IsWin64) {
1862    CCInfo.AllocateStack(32, 8);
1863  }
1864
1865  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
1866
1867  unsigned LastVal = ~0U;
1868  SDValue ArgValue;
1869  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1870    CCValAssign &VA = ArgLocs[i];
1871    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
1872    // places.
1873    assert(VA.getValNo() != LastVal &&
1874           "Don't support value assigned to multiple locs yet");
1875    (void)LastVal;
1876    LastVal = VA.getValNo();
1877
1878    if (VA.isRegLoc()) {
1879      EVT RegVT = VA.getLocVT();
1880      const TargetRegisterClass *RC;
1881      if (RegVT == MVT::i32)
1882        RC = &X86::GR32RegClass;
1883      else if (Is64Bit && RegVT == MVT::i64)
1884        RC = &X86::GR64RegClass;
1885      else if (RegVT == MVT::f32)
1886        RC = &X86::FR32RegClass;
1887      else if (RegVT == MVT::f64)
1888        RC = &X86::FR64RegClass;
1889      else if (RegVT.is256BitVector())
1890        RC = &X86::VR256RegClass;
1891      else if (RegVT.is128BitVector())
1892        RC = &X86::VR128RegClass;
1893      else if (RegVT == MVT::x86mmx)
1894        RC = &X86::VR64RegClass;
1895      else
1896        llvm_unreachable("Unknown argument type!");
1897
1898      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1899      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1900
1901      // If this is an 8 or 16-bit value, it is really passed promoted to 32
1902      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
1903      // right size.
1904      if (VA.getLocInfo() == CCValAssign::SExt)
1905        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1906                               DAG.getValueType(VA.getValVT()));
1907      else if (VA.getLocInfo() == CCValAssign::ZExt)
1908        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1909                               DAG.getValueType(VA.getValVT()));
1910      else if (VA.getLocInfo() == CCValAssign::BCvt)
1911        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
1912
1913      if (VA.isExtInLoc()) {
1914        // Handle MMX values passed in XMM regs.
1915        if (RegVT.isVector()) {
1916          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(),
1917                                 ArgValue);
1918        } else
1919          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1920      }
1921    } else {
1922      assert(VA.isMemLoc());
1923      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
1924    }
1925
1926    // If value is passed via pointer - do a load.
1927    if (VA.getLocInfo() == CCValAssign::Indirect)
1928      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
1929                             MachinePointerInfo(), false, false, false, 0);
1930
1931    InVals.push_back(ArgValue);
1932  }
1933
1934  // The x86-64 ABI for returning structs by value requires that we copy
1935  // the sret argument into %rax for the return. Save the argument into
1936  // a virtual register so that we can access it from the return points.
1937  if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
1938    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1939    unsigned Reg = FuncInfo->getSRetReturnReg();
1940    if (!Reg) {
1941      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
1942      FuncInfo->setSRetReturnReg(Reg);
1943    }
1944    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
1945    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1946  }
1947
1948  unsigned StackSize = CCInfo.getNextStackOffset();
1949  // Align stack specially for tail calls.
1950  if (FuncIsMadeTailCallSafe(CallConv,
1951                             MF.getTarget().Options.GuaranteedTailCallOpt))
1952    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1953
1954  // If the function takes variable number of arguments, make a frame index for
1955  // the start of the first vararg value... for expansion of llvm.va_start.
1956  if (isVarArg) {
1957    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
1958                    CallConv != CallingConv::X86_ThisCall)) {
1959      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
1960    }
1961    if (Is64Bit) {
1962      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
1963
1964      // FIXME: We should really autogenerate these arrays
1965      static const uint16_t GPR64ArgRegsWin64[] = {
1966        X86::RCX, X86::RDX, X86::R8,  X86::R9
1967      };
1968      static const uint16_t GPR64ArgRegs64Bit[] = {
1969        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1970      };
1971      static const uint16_t XMMArgRegs64Bit[] = {
1972        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1973        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1974      };
1975      const uint16_t *GPR64ArgRegs;
1976      unsigned NumXMMRegs = 0;
1977
1978      if (IsWin64) {
1979        // The XMM registers which might contain var arg parameters are shadowed
1980        // in their paired GPR.  So we only need to save the GPR to their home
1981        // slots.
1982        TotalNumIntRegs = 4;
1983        GPR64ArgRegs = GPR64ArgRegsWin64;
1984      } else {
1985        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1986        GPR64ArgRegs = GPR64ArgRegs64Bit;
1987
1988        NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
1989                                                TotalNumXMMRegs);
1990      }
1991      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1992                                                       TotalNumIntRegs);
1993
1994      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
1995      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
1996             "SSE register cannot be used when SSE is disabled!");
1997      assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
1998               NoImplicitFloatOps) &&
1999             "SSE register cannot be used when SSE is disabled!");
2000      if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2001          !Subtarget->hasSSE1())
2002        // Kernel mode asks for SSE to be disabled, so don't push them
2003        // on the stack.
2004        TotalNumXMMRegs = 0;
2005
2006      if (IsWin64) {
2007        const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
2008        // Get to the caller-allocated home save location.  Add 8 to account
2009        // for the return address.
2010        int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2011        FuncInfo->setRegSaveFrameIndex(
2012          MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2013        // Fixup to set vararg frame on shadow area (4 x i64).
2014        if (NumIntRegs < 4)
2015          FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2016      } else {
2017        // For X86-64, if there are vararg parameters that are passed via
2018        // registers, then we must store them to their spots on the stack so
2019        // they may be loaded by deferencing the result of va_next.
2020        FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2021        FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
2022        FuncInfo->setRegSaveFrameIndex(
2023          MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
2024                               false));
2025      }
2026
2027      // Store the integer parameter registers.
2028      SmallVector<SDValue, 8> MemOps;
2029      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2030                                        getPointerTy());
2031      unsigned Offset = FuncInfo->getVarArgsGPOffset();
2032      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
2033        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2034                                  DAG.getIntPtrConstant(Offset));
2035        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
2036                                     &X86::GR64RegClass);
2037        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
2038        SDValue Store =
2039          DAG.getStore(Val.getValue(1), dl, Val, FIN,
2040                       MachinePointerInfo::getFixedStack(
2041                         FuncInfo->getRegSaveFrameIndex(), Offset),
2042                       false, false, 0);
2043        MemOps.push_back(Store);
2044        Offset += 8;
2045      }
2046
2047      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
2048        // Now store the XMM (fp + vector) parameter registers.
2049        SmallVector<SDValue, 11> SaveXMMOps;
2050        SaveXMMOps.push_back(Chain);
2051
2052        unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2053        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
2054        SaveXMMOps.push_back(ALVal);
2055
2056        SaveXMMOps.push_back(DAG.getIntPtrConstant(
2057                               FuncInfo->getRegSaveFrameIndex()));
2058        SaveXMMOps.push_back(DAG.getIntPtrConstant(
2059                               FuncInfo->getVarArgsFPOffset()));
2060
2061        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
2062          unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
2063                                       &X86::VR128RegClass);
2064          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
2065          SaveXMMOps.push_back(Val);
2066        }
2067        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2068                                     MVT::Other,
2069                                     &SaveXMMOps[0], SaveXMMOps.size()));
2070      }
2071
2072      if (!MemOps.empty())
2073        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2074                            &MemOps[0], MemOps.size());
2075    }
2076  }
2077
2078  // Some CCs need callee pop.
2079  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2080                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
2081    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2082  } else {
2083    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2084    // If this is an sret function, the return should pop the hidden pointer.
2085    if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
2086        argsAreStructReturn(Ins) == StackStructReturn)
2087      FuncInfo->setBytesToPopOnReturn(4);
2088  }
2089
2090  if (!Is64Bit) {
2091    // RegSaveFrameIndex is X86-64 only.
2092    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2093    if (CallConv == CallingConv::X86_FastCall ||
2094        CallConv == CallingConv::X86_ThisCall)
2095      // fastcc functions can't have varargs.
2096      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2097  }
2098
2099  FuncInfo->setArgumentStackSize(StackSize);
2100
2101  return Chain;
2102}
2103
2104SDValue
2105X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2106                                    SDValue StackPtr, SDValue Arg,
2107                                    DebugLoc dl, SelectionDAG &DAG,
2108                                    const CCValAssign &VA,
2109                                    ISD::ArgFlagsTy Flags) const {
2110  unsigned LocMemOffset = VA.getLocMemOffset();
2111  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2112  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2113  if (Flags.isByVal())
2114    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2115
2116  return DAG.getStore(Chain, dl, Arg, PtrOff,
2117                      MachinePointerInfo::getStack(LocMemOffset),
2118                      false, false, 0);
2119}
2120
2121/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
2122/// optimization is performed and it is required.
2123SDValue
2124X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2125                                           SDValue &OutRetAddr, SDValue Chain,
2126                                           bool IsTailCall, bool Is64Bit,
2127                                           int FPDiff, DebugLoc dl) const {
2128  // Adjust the Return address stack slot.
2129  EVT VT = getPointerTy();
2130  OutRetAddr = getReturnAddressFrameIndex(DAG);
2131
2132  // Load the "old" Return address.
2133  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2134                           false, false, false, 0);
2135  return SDValue(OutRetAddr.getNode(), 1);
2136}
2137
2138/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
2139/// optimization is performed and it is required (FPDiff!=0).
2140static SDValue
2141EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
2142                         SDValue Chain, SDValue RetAddrFrIdx,
2143                         bool Is64Bit, int FPDiff, DebugLoc dl) {
2144  // Store the return address to the appropriate stack slot.
2145  if (!FPDiff) return Chain;
2146  // Calculate the new stack slot for the return address.
2147  int SlotSize = Is64Bit ? 8 : 4;
2148  int NewReturnAddrFI =
2149    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
2150  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
2151  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
2152  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2153                       MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2154                       false, false, 0);
2155  return Chain;
2156}
2157
2158SDValue
2159X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2160                             SmallVectorImpl<SDValue> &InVals) const {
2161  SelectionDAG &DAG                     = CLI.DAG;
2162  DebugLoc &dl                          = CLI.DL;
2163  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2164  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
2165  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
2166  SDValue Chain                         = CLI.Chain;
2167  SDValue Callee                        = CLI.Callee;
2168  CallingConv::ID CallConv              = CLI.CallConv;
2169  bool &isTailCall                      = CLI.IsTailCall;
2170  bool isVarArg                         = CLI.IsVarArg;
2171
2172  MachineFunction &MF = DAG.getMachineFunction();
2173  bool Is64Bit        = Subtarget->is64Bit();
2174  bool IsWin64        = Subtarget->isTargetWin64();
2175  bool IsWindows      = Subtarget->isTargetWindows();
2176  StructReturnType SR = callIsStructReturn(Outs);
2177  bool IsSibcall      = false;
2178
2179  if (MF.getTarget().Options.DisableTailCalls)
2180    isTailCall = false;
2181
2182  if (isTailCall) {
2183    // Check if it's really possible to do a tail call.
2184    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2185                    isVarArg, SR != NotStructReturn,
2186                    MF.getFunction()->hasStructRetAttr(),
2187                    Outs, OutVals, Ins, DAG);
2188
2189    // Sibcalls are automatically detected tailcalls which do not require
2190    // ABI changes.
2191    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2192      IsSibcall = true;
2193
2194    if (isTailCall)
2195      ++NumTailCalls;
2196  }
2197
2198  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2199         "Var args not supported with calling convention fastcc or ghc");
2200
2201  // Analyze operands of the call, assigning locations to each operand.
2202  SmallVector<CCValAssign, 16> ArgLocs;
2203  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
2204                 ArgLocs, *DAG.getContext());
2205
2206  // Allocate shadow area for Win64
2207  if (IsWin64) {
2208    CCInfo.AllocateStack(32, 8);
2209  }
2210
2211  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2212
2213  // Get a count of how many bytes are to be pushed on the stack.
2214  unsigned NumBytes = CCInfo.getNextStackOffset();
2215  if (IsSibcall)
2216    // This is a sibcall. The memory operands are available in caller's
2217    // own caller's stack.
2218    NumBytes = 0;
2219  else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
2220           IsTailCallConvention(CallConv))
2221    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2222
2223  int FPDiff = 0;
2224  if (isTailCall && !IsSibcall) {
2225    // Lower arguments at fp - stackoffset + fpdiff.
2226    unsigned NumBytesCallerPushed =
2227      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
2228    FPDiff = NumBytesCallerPushed - NumBytes;
2229
2230    // Set the delta of movement of the returnaddr stackslot.
2231    // But only set if delta is greater than previous delta.
2232    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
2233      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
2234  }
2235
2236  if (!IsSibcall)
2237    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
2238
2239  SDValue RetAddrFrIdx;
2240  // Load return address for tail calls.
2241  if (isTailCall && FPDiff)
2242    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2243                                    Is64Bit, FPDiff, dl);
2244
2245  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2246  SmallVector<SDValue, 8> MemOpChains;
2247  SDValue StackPtr;
2248
2249  // Walk the register/memloc assignments, inserting copies/loads.  In the case
2250  // of tail call optimization arguments are handle later.
2251  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2252    CCValAssign &VA = ArgLocs[i];
2253    EVT RegVT = VA.getLocVT();
2254    SDValue Arg = OutVals[i];
2255    ISD::ArgFlagsTy Flags = Outs[i].Flags;
2256    bool isByVal = Flags.isByVal();
2257
2258    // Promote the value if needed.
2259    switch (VA.getLocInfo()) {
2260    default: llvm_unreachable("Unknown loc info!");
2261    case CCValAssign::Full: break;
2262    case CCValAssign::SExt:
2263      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2264      break;
2265    case CCValAssign::ZExt:
2266      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2267      break;
2268    case CCValAssign::AExt:
2269      if (RegVT.is128BitVector()) {
2270        // Special case: passing MMX values in XMM registers.
2271        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2272        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2273        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2274      } else
2275        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2276      break;
2277    case CCValAssign::BCvt:
2278      Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2279      break;
2280    case CCValAssign::Indirect: {
2281      // Store the argument.
2282      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2283      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2284      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2285                           MachinePointerInfo::getFixedStack(FI),
2286                           false, false, 0);
2287      Arg = SpillSlot;
2288      break;
2289    }
2290    }
2291
2292    if (VA.isRegLoc()) {
2293      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2294      if (isVarArg && IsWin64) {
2295        // Win64 ABI requires argument XMM reg to be copied to the corresponding
2296        // shadow reg if callee is a varargs function.
2297        unsigned ShadowReg = 0;
2298        switch (VA.getLocReg()) {
2299        case X86::XMM0: ShadowReg = X86::RCX; break;
2300        case X86::XMM1: ShadowReg = X86::RDX; break;
2301        case X86::XMM2: ShadowReg = X86::R8; break;
2302        case X86::XMM3: ShadowReg = X86::R9; break;
2303        }
2304        if (ShadowReg)
2305          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2306      }
2307    } else if (!IsSibcall && (!isTailCall || isByVal)) {
2308      assert(VA.isMemLoc());
2309      if (StackPtr.getNode() == 0)
2310        StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
2311      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2312                                             dl, DAG, VA, Flags));
2313    }
2314  }
2315
2316  if (!MemOpChains.empty())
2317    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2318                        &MemOpChains[0], MemOpChains.size());
2319
2320  if (Subtarget->isPICStyleGOT()) {
2321    // ELF / PIC requires GOT in the EBX register before function calls via PLT
2322    // GOT pointer.
2323    if (!isTailCall) {
2324      RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2325               DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy())));
2326    } else {
2327      // If we are tail calling and generating PIC/GOT style code load the
2328      // address of the callee into ECX. The value in ecx is used as target of
2329      // the tail jump. This is done to circumvent the ebx/callee-saved problem
2330      // for tail calls on PIC/GOT architectures. Normally we would just put the
2331      // address of GOT into ebx and then call target@PLT. But for tail calls
2332      // ebx would be restored (since ebx is callee saved) before jumping to the
2333      // target@PLT.
2334
2335      // Note: The actual moving to ECX is done further down.
2336      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2337      if (G && !G->getGlobal()->hasHiddenVisibility() &&
2338          !G->getGlobal()->hasProtectedVisibility())
2339        Callee = LowerGlobalAddress(Callee, DAG);
2340      else if (isa<ExternalSymbolSDNode>(Callee))
2341        Callee = LowerExternalSymbol(Callee, DAG);
2342    }
2343  }
2344
2345  if (Is64Bit && isVarArg && !IsWin64) {
2346    // From AMD64 ABI document:
2347    // For calls that may call functions that use varargs or stdargs
2348    // (prototype-less calls or calls to functions containing ellipsis (...) in
2349    // the declaration) %al is used as hidden argument to specify the number
2350    // of SSE registers used. The contents of %al do not need to match exactly
2351    // the number of registers, but must be an ubound on the number of SSE
2352    // registers used and is in the range 0 - 8 inclusive.
2353
2354    // Count the number of XMM registers allocated.
2355    static const uint16_t XMMArgRegs[] = {
2356      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2357      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2358    };
2359    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2360    assert((Subtarget->hasSSE1() || !NumXMMRegs)
2361           && "SSE registers cannot be used when SSE is disabled");
2362
2363    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
2364                                        DAG.getConstant(NumXMMRegs, MVT::i8)));
2365  }
2366
2367  // For tail calls lower the arguments to the 'real' stack slot.
2368  if (isTailCall) {
2369    // Force all the incoming stack arguments to be loaded from the stack
2370    // before any new outgoing arguments are stored to the stack, because the
2371    // outgoing stack slots may alias the incoming argument stack slots, and
2372    // the alias isn't otherwise explicit. This is slightly more conservative
2373    // than necessary, because it means that each store effectively depends
2374    // on every argument instead of just those arguments it would clobber.
2375    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2376
2377    SmallVector<SDValue, 8> MemOpChains2;
2378    SDValue FIN;
2379    int FI = 0;
2380    if (getTargetMachine().Options.GuaranteedTailCallOpt) {
2381      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2382        CCValAssign &VA = ArgLocs[i];
2383        if (VA.isRegLoc())
2384          continue;
2385        assert(VA.isMemLoc());
2386        SDValue Arg = OutVals[i];
2387        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2388        // Create frame index.
2389        int32_t Offset = VA.getLocMemOffset()+FPDiff;
2390        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2391        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2392        FIN = DAG.getFrameIndex(FI, getPointerTy());
2393
2394        if (Flags.isByVal()) {
2395          // Copy relative to framepointer.
2396          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2397          if (StackPtr.getNode() == 0)
2398            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
2399                                          getPointerTy());
2400          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2401
2402          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2403                                                           ArgChain,
2404                                                           Flags, DAG, dl));
2405        } else {
2406          // Store relative to framepointer.
2407          MemOpChains2.push_back(
2408            DAG.getStore(ArgChain, dl, Arg, FIN,
2409                         MachinePointerInfo::getFixedStack(FI),
2410                         false, false, 0));
2411        }
2412      }
2413    }
2414
2415    if (!MemOpChains2.empty())
2416      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2417                          &MemOpChains2[0], MemOpChains2.size());
2418
2419    // Store the return address to the appropriate stack slot.
2420    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
2421                                     FPDiff, dl);
2422  }
2423
2424  // Build a sequence of copy-to-reg nodes chained together with token chain
2425  // and flag operands which copy the outgoing args into registers.
2426  SDValue InFlag;
2427  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2428    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2429                             RegsToPass[i].second, InFlag);
2430    InFlag = Chain.getValue(1);
2431  }
2432
2433  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2434    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2435    // In the 64-bit large code model, we have to make all calls
2436    // through a register, since the call instruction's 32-bit
2437    // pc-relative offset may not be large enough to hold the whole
2438    // address.
2439  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2440    // If the callee is a GlobalAddress node (quite common, every direct call
2441    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
2442    // it.
2443
2444    // We should use extra load for direct calls to dllimported functions in
2445    // non-JIT mode.
2446    const GlobalValue *GV = G->getGlobal();
2447    if (!GV->hasDLLImportLinkage()) {
2448      unsigned char OpFlags = 0;
2449      bool ExtraLoad = false;
2450      unsigned WrapperKind = ISD::DELETED_NODE;
2451
2452      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2453      // external symbols most go through the PLT in PIC mode.  If the symbol
2454      // has hidden or protected visibility, or if it is static or local, then
2455      // we don't need to use the PLT - we can directly call it.
2456      if (Subtarget->isTargetELF() &&
2457          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2458          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2459        OpFlags = X86II::MO_PLT;
2460      } else if (Subtarget->isPICStyleStubAny() &&
2461                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
2462                 (!Subtarget->getTargetTriple().isMacOSX() ||
2463                  Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2464        // PC-relative references to external symbols should go through $stub,
2465        // unless we're building with the leopard linker or later, which
2466        // automatically synthesizes these stubs.
2467        OpFlags = X86II::MO_DARWIN_STUB;
2468      } else if (Subtarget->isPICStyleRIPRel() &&
2469                 isa<Function>(GV) &&
2470                 cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) {
2471        // If the function is marked as non-lazy, generate an indirect call
2472        // which loads from the GOT directly. This avoids runtime overhead
2473        // at the cost of eager binding (and one extra byte of encoding).
2474        OpFlags = X86II::MO_GOTPCREL;
2475        WrapperKind = X86ISD::WrapperRIP;
2476        ExtraLoad = true;
2477      }
2478
2479      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
2480                                          G->getOffset(), OpFlags);
2481
2482      // Add a wrapper if needed.
2483      if (WrapperKind != ISD::DELETED_NODE)
2484        Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
2485      // Add extra indirection if needed.
2486      if (ExtraLoad)
2487        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
2488                             MachinePointerInfo::getGOT(),
2489                             false, false, false, 0);
2490    }
2491  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2492    unsigned char OpFlags = 0;
2493
2494    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
2495    // external symbols should go through the PLT.
2496    if (Subtarget->isTargetELF() &&
2497        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2498      OpFlags = X86II::MO_PLT;
2499    } else if (Subtarget->isPICStyleStubAny() &&
2500               (!Subtarget->getTargetTriple().isMacOSX() ||
2501                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2502      // PC-relative references to external symbols should go through $stub,
2503      // unless we're building with the leopard linker or later, which
2504      // automatically synthesizes these stubs.
2505      OpFlags = X86II::MO_DARWIN_STUB;
2506    }
2507
2508    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2509                                         OpFlags);
2510  }
2511
2512  // Returns a chain & a flag for retval copy to use.
2513  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2514  SmallVector<SDValue, 8> Ops;
2515
2516  if (!IsSibcall && isTailCall) {
2517    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2518                           DAG.getIntPtrConstant(0, true), InFlag);
2519    InFlag = Chain.getValue(1);
2520  }
2521
2522  Ops.push_back(Chain);
2523  Ops.push_back(Callee);
2524
2525  if (isTailCall)
2526    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2527
2528  // Add argument registers to the end of the list so that they are known live
2529  // into the call.
2530  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2531    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2532                                  RegsToPass[i].second.getValueType()));
2533
2534  // Add a register mask operand representing the call-preserved registers.
2535  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
2536  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
2537  assert(Mask && "Missing call preserved mask for calling convention");
2538  Ops.push_back(DAG.getRegisterMask(Mask));
2539
2540  if (InFlag.getNode())
2541    Ops.push_back(InFlag);
2542
2543  if (isTailCall) {
2544    // We used to do:
2545    //// If this is the first return lowered for this function, add the regs
2546    //// to the liveout set for the function.
2547    // This isn't right, although it's probably harmless on x86; liveouts
2548    // should be computed from returns not tail calls.  Consider a void
2549    // function making a tail call to a function returning int.
2550    return DAG.getNode(X86ISD::TC_RETURN, dl,
2551                       NodeTys, &Ops[0], Ops.size());
2552  }
2553
2554  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2555  InFlag = Chain.getValue(1);
2556
2557  // Create the CALLSEQ_END node.
2558  unsigned NumBytesForCalleeToPush;
2559  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2560                       getTargetMachine().Options.GuaranteedTailCallOpt))
2561    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
2562  else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
2563           SR == StackStructReturn)
2564    // If this is a call to a struct-return function, the callee
2565    // pops the hidden struct pointer, so we have to push it back.
2566    // This is common for Darwin/X86, Linux & Mingw32 targets.
2567    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
2568    NumBytesForCalleeToPush = 4;
2569  else
2570    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2571
2572  // Returns a flag for retval copy to use.
2573  if (!IsSibcall) {
2574    Chain = DAG.getCALLSEQ_END(Chain,
2575                               DAG.getIntPtrConstant(NumBytes, true),
2576                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2577                                                     true),
2578                               InFlag);
2579    InFlag = Chain.getValue(1);
2580  }
2581
2582  // Handle result values, copying them out of physregs into vregs that we
2583  // return.
2584  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2585                         Ins, dl, DAG, InVals);
2586}
2587
2588
2589//===----------------------------------------------------------------------===//
2590//                Fast Calling Convention (tail call) implementation
2591//===----------------------------------------------------------------------===//
2592
2593//  Like std call, callee cleans arguments, convention except that ECX is
2594//  reserved for storing the tail called function address. Only 2 registers are
2595//  free for argument passing (inreg). Tail call optimization is performed
2596//  provided:
2597//                * tailcallopt is enabled
2598//                * caller/callee are fastcc
2599//  On X86_64 architecture with GOT-style position independent code only local
2600//  (within module) calls are supported at the moment.
2601//  To keep the stack aligned according to platform abi the function
2602//  GetAlignedArgumentStackSize ensures that argument delta is always multiples
2603//  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
2604//  If a tail called function callee has more arguments than the caller the
2605//  caller needs to make sure that there is room to move the RETADDR to. This is
2606//  achieved by reserving an area the size of the argument delta right after the
2607//  original REtADDR, but before the saved framepointer or the spilled registers
2608//  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
2609//  stack layout:
2610//    arg1
2611//    arg2
2612//    RETADDR
2613//    [ new RETADDR
2614//      move area ]
2615//    (possible EBP)
2616//    ESI
2617//    EDI
2618//    local1 ..
2619
2620/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
2621/// for a 16 byte align requirement.
2622unsigned
2623X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2624                                               SelectionDAG& DAG) const {
2625  MachineFunction &MF = DAG.getMachineFunction();
2626  const TargetMachine &TM = MF.getTarget();
2627  const TargetFrameLowering &TFI = *TM.getFrameLowering();
2628  unsigned StackAlignment = TFI.getStackAlignment();
2629  uint64_t AlignMask = StackAlignment - 1;
2630  int64_t Offset = StackSize;
2631  uint64_t SlotSize = TD->getPointerSize();
2632  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2633    // Number smaller than 12 so just add the difference.
2634    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2635  } else {
2636    // Mask out lower bits, add stackalignment once plus the 12 bytes.
2637    Offset = ((~AlignMask) & Offset) + StackAlignment +
2638      (StackAlignment-SlotSize);
2639  }
2640  return Offset;
2641}
2642
2643/// MatchingStackOffset - Return true if the given stack call argument is
2644/// already available in the same position (relatively) of the caller's
2645/// incoming argument stack.
2646static
2647bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2648                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2649                         const X86InstrInfo *TII) {
2650  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
2651  int FI = INT_MAX;
2652  if (Arg.getOpcode() == ISD::CopyFromReg) {
2653    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2654    if (!TargetRegisterInfo::isVirtualRegister(VR))
2655      return false;
2656    MachineInstr *Def = MRI->getVRegDef(VR);
2657    if (!Def)
2658      return false;
2659    if (!Flags.isByVal()) {
2660      if (!TII->isLoadFromStackSlot(Def, FI))
2661        return false;
2662    } else {
2663      unsigned Opcode = Def->getOpcode();
2664      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
2665          Def->getOperand(1).isFI()) {
2666        FI = Def->getOperand(1).getIndex();
2667        Bytes = Flags.getByValSize();
2668      } else
2669        return false;
2670    }
2671  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2672    if (Flags.isByVal())
2673      // ByVal argument is passed in as a pointer but it's now being
2674      // dereferenced. e.g.
2675      // define @foo(%struct.X* %A) {
2676      //   tail call @bar(%struct.X* byval %A)
2677      // }
2678      return false;
2679    SDValue Ptr = Ld->getBasePtr();
2680    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2681    if (!FINode)
2682      return false;
2683    FI = FINode->getIndex();
2684  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
2685    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
2686    FI = FINode->getIndex();
2687    Bytes = Flags.getByValSize();
2688  } else
2689    return false;
2690
2691  assert(FI != INT_MAX);
2692  if (!MFI->isFixedObjectIndex(FI))
2693    return false;
2694  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
2695}
2696
2697/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2698/// for tail call optimization. Targets which want to do tail call
2699/// optimization should implement this function.
2700bool
2701X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2702                                                     CallingConv::ID CalleeCC,
2703                                                     bool isVarArg,
2704                                                     bool isCalleeStructRet,
2705                                                     bool isCallerStructRet,
2706                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
2707                                    const SmallVectorImpl<SDValue> &OutVals,
2708                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2709                                                     SelectionDAG& DAG) const {
2710  if (!IsTailCallConvention(CalleeCC) &&
2711      CalleeCC != CallingConv::C)
2712    return false;
2713
2714  // If -tailcallopt is specified, make fastcc functions tail-callable.
2715  const MachineFunction &MF = DAG.getMachineFunction();
2716  const Function *CallerF = DAG.getMachineFunction().getFunction();
2717  CallingConv::ID CallerCC = CallerF->getCallingConv();
2718  bool CCMatch = CallerCC == CalleeCC;
2719
2720  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
2721    if (IsTailCallConvention(CalleeCC) && CCMatch)
2722      return true;
2723    return false;
2724  }
2725
2726  // Look for obvious safe cases to perform tail call optimization that do not
2727  // require ABI changes. This is what gcc calls sibcall.
2728
2729  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
2730  // emit a special epilogue.
2731  if (RegInfo->needsStackRealignment(MF))
2732    return false;
2733
2734  // Also avoid sibcall optimization if either caller or callee uses struct
2735  // return semantics.
2736  if (isCalleeStructRet || isCallerStructRet)
2737    return false;
2738
2739  // An stdcall caller is expected to clean up its arguments; the callee
2740  // isn't going to do that.
2741  if (!CCMatch && CallerCC==CallingConv::X86_StdCall)
2742    return false;
2743
2744  // Do not sibcall optimize vararg calls unless all arguments are passed via
2745  // registers.
2746  if (isVarArg && !Outs.empty()) {
2747
2748    // Optimizing for varargs on Win64 is unlikely to be safe without
2749    // additional testing.
2750    if (Subtarget->isTargetWin64())
2751      return false;
2752
2753    SmallVector<CCValAssign, 16> ArgLocs;
2754    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
2755                   getTargetMachine(), ArgLocs, *DAG.getContext());
2756
2757    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2758    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
2759      if (!ArgLocs[i].isRegLoc())
2760        return false;
2761  }
2762
2763  // If the call result is in ST0 / ST1, it needs to be popped off the x87
2764  // stack.  Therefore, if it's not used by the call it is not safe to optimize
2765  // this into a sibcall.
2766  bool Unused = false;
2767  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
2768    if (!Ins[i].Used) {
2769      Unused = true;
2770      break;
2771    }
2772  }
2773  if (Unused) {
2774    SmallVector<CCValAssign, 16> RVLocs;
2775    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
2776                   getTargetMachine(), RVLocs, *DAG.getContext());
2777    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2778    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2779      CCValAssign &VA = RVLocs[i];
2780      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
2781        return false;
2782    }
2783  }
2784
2785  // If the calling conventions do not match, then we'd better make sure the
2786  // results are returned in the same way as what the caller expects.
2787  if (!CCMatch) {
2788    SmallVector<CCValAssign, 16> RVLocs1;
2789    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
2790                    getTargetMachine(), RVLocs1, *DAG.getContext());
2791    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
2792
2793    SmallVector<CCValAssign, 16> RVLocs2;
2794    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
2795                    getTargetMachine(), RVLocs2, *DAG.getContext());
2796    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
2797
2798    if (RVLocs1.size() != RVLocs2.size())
2799      return false;
2800    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
2801      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
2802        return false;
2803      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
2804        return false;
2805      if (RVLocs1[i].isRegLoc()) {
2806        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
2807          return false;
2808      } else {
2809        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
2810          return false;
2811      }
2812    }
2813  }
2814
2815  // If the callee takes no arguments then go on to check the results of the
2816  // call.
2817  if (!Outs.empty()) {
2818    // Check if stack adjustment is needed. For now, do not do this if any
2819    // argument is passed on the stack.
2820    SmallVector<CCValAssign, 16> ArgLocs;
2821    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
2822                   getTargetMachine(), ArgLocs, *DAG.getContext());
2823
2824    // Allocate shadow area for Win64
2825    if (Subtarget->isTargetWin64()) {
2826      CCInfo.AllocateStack(32, 8);
2827    }
2828
2829    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2830    if (CCInfo.getNextStackOffset()) {
2831      MachineFunction &MF = DAG.getMachineFunction();
2832      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
2833        return false;
2834
2835      // Check if the arguments are already laid out in the right way as
2836      // the caller's fixed stack objects.
2837      MachineFrameInfo *MFI = MF.getFrameInfo();
2838      const MachineRegisterInfo *MRI = &MF.getRegInfo();
2839      const X86InstrInfo *TII =
2840        ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
2841      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2842        CCValAssign &VA = ArgLocs[i];
2843        SDValue Arg = OutVals[i];
2844        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2845        if (VA.getLocInfo() == CCValAssign::Indirect)
2846          return false;
2847        if (!VA.isRegLoc()) {
2848          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2849                                   MFI, MRI, TII))
2850            return false;
2851        }
2852      }
2853    }
2854
2855    // If the tailcall address may be in a register, then make sure it's
2856    // possible to register allocate for it. In 32-bit, the call address can
2857    // only target EAX, EDX, or ECX since the tail call must be scheduled after
2858    // callee-saved registers are restored. These happen to be the same
2859    // registers used to pass 'inreg' arguments so watch out for those.
2860    if (!Subtarget->is64Bit() &&
2861        !isa<GlobalAddressSDNode>(Callee) &&
2862        !isa<ExternalSymbolSDNode>(Callee)) {
2863      unsigned NumInRegs = 0;
2864      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2865        CCValAssign &VA = ArgLocs[i];
2866        if (!VA.isRegLoc())
2867          continue;
2868        unsigned Reg = VA.getLocReg();
2869        switch (Reg) {
2870        default: break;
2871        case X86::EAX: case X86::EDX: case X86::ECX:
2872          if (++NumInRegs == 3)
2873            return false;
2874          break;
2875        }
2876      }
2877    }
2878  }
2879
2880  return true;
2881}
2882
2883FastISel *
2884X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2885                                  const TargetLibraryInfo *libInfo) const {
2886  return X86::createFastISel(funcInfo, libInfo);
2887}
2888
2889
2890//===----------------------------------------------------------------------===//
2891//                           Other Lowering Hooks
2892//===----------------------------------------------------------------------===//
2893
2894static bool MayFoldLoad(SDValue Op) {
2895  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
2896}
2897
2898static bool MayFoldIntoStore(SDValue Op) {
2899  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
2900}
2901
2902static bool isTargetShuffle(unsigned Opcode) {
2903  switch(Opcode) {
2904  default: return false;
2905  case X86ISD::PSHUFD:
2906  case X86ISD::PSHUFHW:
2907  case X86ISD::PSHUFLW:
2908  case X86ISD::SHUFP:
2909  case X86ISD::PALIGN:
2910  case X86ISD::MOVLHPS:
2911  case X86ISD::MOVLHPD:
2912  case X86ISD::MOVHLPS:
2913  case X86ISD::MOVLPS:
2914  case X86ISD::MOVLPD:
2915  case X86ISD::MOVSHDUP:
2916  case X86ISD::MOVSLDUP:
2917  case X86ISD::MOVDDUP:
2918  case X86ISD::MOVSS:
2919  case X86ISD::MOVSD:
2920  case X86ISD::UNPCKL:
2921  case X86ISD::UNPCKH:
2922  case X86ISD::VPERMILP:
2923  case X86ISD::VPERM2X128:
2924  case X86ISD::VPERMI:
2925    return true;
2926  }
2927}
2928
2929static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
2930                                    SDValue V1, SelectionDAG &DAG) {
2931  switch(Opc) {
2932  default: llvm_unreachable("Unknown x86 shuffle node");
2933  case X86ISD::MOVSHDUP:
2934  case X86ISD::MOVSLDUP:
2935  case X86ISD::MOVDDUP:
2936    return DAG.getNode(Opc, dl, VT, V1);
2937  }
2938}
2939
2940static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
2941                                    SDValue V1, unsigned TargetMask,
2942                                    SelectionDAG &DAG) {
2943  switch(Opc) {
2944  default: llvm_unreachable("Unknown x86 shuffle node");
2945  case X86ISD::PSHUFD:
2946  case X86ISD::PSHUFHW:
2947  case X86ISD::PSHUFLW:
2948  case X86ISD::VPERMILP:
2949  case X86ISD::VPERMI:
2950    return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
2951  }
2952}
2953
2954static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
2955                                    SDValue V1, SDValue V2, unsigned TargetMask,
2956                                    SelectionDAG &DAG) {
2957  switch(Opc) {
2958  default: llvm_unreachable("Unknown x86 shuffle node");
2959  case X86ISD::PALIGN:
2960  case X86ISD::SHUFP:
2961  case X86ISD::VPERM2X128:
2962    return DAG.getNode(Opc, dl, VT, V1, V2,
2963                       DAG.getConstant(TargetMask, MVT::i8));
2964  }
2965}
2966
2967static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
2968                                    SDValue V1, SDValue V2, SelectionDAG &DAG) {
2969  switch(Opc) {
2970  default: llvm_unreachable("Unknown x86 shuffle node");
2971  case X86ISD::MOVLHPS:
2972  case X86ISD::MOVLHPD:
2973  case X86ISD::MOVHLPS:
2974  case X86ISD::MOVLPS:
2975  case X86ISD::MOVLPD:
2976  case X86ISD::MOVSS:
2977  case X86ISD::MOVSD:
2978  case X86ISD::UNPCKL:
2979  case X86ISD::UNPCKH:
2980    return DAG.getNode(Opc, dl, VT, V1, V2);
2981  }
2982}
2983
2984SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
2985  MachineFunction &MF = DAG.getMachineFunction();
2986  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2987  int ReturnAddrIndex = FuncInfo->getRAIndex();
2988
2989  if (ReturnAddrIndex == 0) {
2990    // Set up a frame object for the return address.
2991    uint64_t SlotSize = TD->getPointerSize();
2992    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
2993                                                           false);
2994    FuncInfo->setRAIndex(ReturnAddrIndex);
2995  }
2996
2997  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2998}
2999
3000
3001bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3002                                       bool hasSymbolicDisplacement) {
3003  // Offset should fit into 32 bit immediate field.
3004  if (!isInt<32>(Offset))
3005    return false;
3006
3007  // If we don't have a symbolic displacement - we don't have any extra
3008  // restrictions.
3009  if (!hasSymbolicDisplacement)
3010    return true;
3011
3012  // FIXME: Some tweaks might be needed for medium code model.
3013  if (M != CodeModel::Small && M != CodeModel::Kernel)
3014    return false;
3015
3016  // For small code model we assume that latest object is 16MB before end of 31
3017  // bits boundary. We may also accept pretty large negative constants knowing
3018  // that all objects are in the positive half of address space.
3019  if (M == CodeModel::Small && Offset < 16*1024*1024)
3020    return true;
3021
3022  // For kernel code model we know that all object resist in the negative half
3023  // of 32bits address space. We may not accept negative offsets, since they may
3024  // be just off and we may accept pretty large positive ones.
3025  if (M == CodeModel::Kernel && Offset > 0)
3026    return true;
3027
3028  return false;
3029}
3030
3031/// isCalleePop - Determines whether the callee is required to pop its
3032/// own arguments. Callee pop is necessary to support tail calls.
3033bool X86::isCalleePop(CallingConv::ID CallingConv,
3034                      bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3035  if (IsVarArg)
3036    return false;
3037
3038  switch (CallingConv) {
3039  default:
3040    return false;
3041  case CallingConv::X86_StdCall:
3042    return !is64Bit;
3043  case CallingConv::X86_FastCall:
3044    return !is64Bit;
3045  case CallingConv::X86_ThisCall:
3046    return !is64Bit;
3047  case CallingConv::Fast:
3048    return TailCallOpt;
3049  case CallingConv::GHC:
3050    return TailCallOpt;
3051  }
3052}
3053
3054/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
3055/// specific condition code, returning the condition code and the LHS/RHS of the
3056/// comparison to make.
3057static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3058                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3059  if (!isFP) {
3060    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3061      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3062        // X > -1   -> X == 0, jump !sign.
3063        RHS = DAG.getConstant(0, RHS.getValueType());
3064        return X86::COND_NS;
3065      }
3066      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3067        // X < 0   -> X == 0, jump on sign.
3068        return X86::COND_S;
3069      }
3070      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3071        // X < 1   -> X <= 0
3072        RHS = DAG.getConstant(0, RHS.getValueType());
3073        return X86::COND_LE;
3074      }
3075    }
3076
3077    switch (SetCCOpcode) {
3078    default: llvm_unreachable("Invalid integer condition!");
3079    case ISD::SETEQ:  return X86::COND_E;
3080    case ISD::SETGT:  return X86::COND_G;
3081    case ISD::SETGE:  return X86::COND_GE;
3082    case ISD::SETLT:  return X86::COND_L;
3083    case ISD::SETLE:  return X86::COND_LE;
3084    case ISD::SETNE:  return X86::COND_NE;
3085    case ISD::SETULT: return X86::COND_B;
3086    case ISD::SETUGT: return X86::COND_A;
3087    case ISD::SETULE: return X86::COND_BE;
3088    case ISD::SETUGE: return X86::COND_AE;
3089    }
3090  }
3091
3092  // First determine if it is required or is profitable to flip the operands.
3093
3094  // If LHS is a foldable load, but RHS is not, flip the condition.
3095  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3096      !ISD::isNON_EXTLoad(RHS.getNode())) {
3097    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3098    std::swap(LHS, RHS);
3099  }
3100
3101  switch (SetCCOpcode) {
3102  default: break;
3103  case ISD::SETOLT:
3104  case ISD::SETOLE:
3105  case ISD::SETUGT:
3106  case ISD::SETUGE:
3107    std::swap(LHS, RHS);
3108    break;
3109  }
3110
3111  // On a floating point condition, the flags are set as follows:
3112  // ZF  PF  CF   op
3113  //  0 | 0 | 0 | X > Y
3114  //  0 | 0 | 1 | X < Y
3115  //  1 | 0 | 0 | X == Y
3116  //  1 | 1 | 1 | unordered
3117  switch (SetCCOpcode) {
3118  default: llvm_unreachable("Condcode should be pre-legalized away");
3119  case ISD::SETUEQ:
3120  case ISD::SETEQ:   return X86::COND_E;
3121  case ISD::SETOLT:              // flipped
3122  case ISD::SETOGT:
3123  case ISD::SETGT:   return X86::COND_A;
3124  case ISD::SETOLE:              // flipped
3125  case ISD::SETOGE:
3126  case ISD::SETGE:   return X86::COND_AE;
3127  case ISD::SETUGT:              // flipped
3128  case ISD::SETULT:
3129  case ISD::SETLT:   return X86::COND_B;
3130  case ISD::SETUGE:              // flipped
3131  case ISD::SETULE:
3132  case ISD::SETLE:   return X86::COND_BE;
3133  case ISD::SETONE:
3134  case ISD::SETNE:   return X86::COND_NE;
3135  case ISD::SETUO:   return X86::COND_P;
3136  case ISD::SETO:    return X86::COND_NP;
3137  case ISD::SETOEQ:
3138  case ISD::SETUNE:  return X86::COND_INVALID;
3139  }
3140}
3141
3142/// hasFPCMov - is there a floating point cmov for the specific X86 condition
3143/// code. Current x86 isa includes the following FP cmov instructions:
3144/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3145static bool hasFPCMov(unsigned X86CC) {
3146  switch (X86CC) {
3147  default:
3148    return false;
3149  case X86::COND_B:
3150  case X86::COND_BE:
3151  case X86::COND_E:
3152  case X86::COND_P:
3153  case X86::COND_A:
3154  case X86::COND_AE:
3155  case X86::COND_NE:
3156  case X86::COND_NP:
3157    return true;
3158  }
3159}
3160
3161/// isFPImmLegal - Returns true if the target can instruction select the
3162/// specified FP immediate natively. If false, the legalizer will
3163/// materialize the FP immediate as a load from a constant pool.
3164bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3165  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3166    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3167      return true;
3168  }
3169  return false;
3170}
3171
3172/// isUndefOrInRange - Return true if Val is undef or if its value falls within
3173/// the specified range (L, H].
3174static bool isUndefOrInRange(int Val, int Low, int Hi) {
3175  return (Val < 0) || (Val >= Low && Val < Hi);
3176}
3177
3178/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
3179/// specified value.
3180static bool isUndefOrEqual(int Val, int CmpVal) {
3181  if (Val < 0 || Val == CmpVal)
3182    return true;
3183  return false;
3184}
3185
3186/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3187/// from position Pos and ending in Pos+Size, falls within the specified
3188/// sequential range (L, L+Pos]. or is undef.
3189static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3190                                       unsigned Pos, unsigned Size, int Low) {
3191  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3192    if (!isUndefOrEqual(Mask[i], Low))
3193      return false;
3194  return true;
3195}
3196
3197/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3198/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
3199/// the second operand.
3200static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
3201  if (VT == MVT::v4f32 || VT == MVT::v4i32 )
3202    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
3203  if (VT == MVT::v2f64 || VT == MVT::v2i64)
3204    return (Mask[0] < 2 && Mask[1] < 2);
3205  return false;
3206}
3207
3208/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3209/// is suitable for input to PSHUFHW.
3210static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
3211  if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16))
3212    return false;
3213
3214  // Lower quadword copied in order or undef.
3215  if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3216    return false;
3217
3218  // Upper quadword shuffled.
3219  for (unsigned i = 4; i != 8; ++i)
3220    if (!isUndefOrInRange(Mask[i], 4, 8))
3221      return false;
3222
3223  if (VT == MVT::v16i16) {
3224    // Lower quadword copied in order or undef.
3225    if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3226      return false;
3227
3228    // Upper quadword shuffled.
3229    for (unsigned i = 12; i != 16; ++i)
3230      if (!isUndefOrInRange(Mask[i], 12, 16))
3231        return false;
3232  }
3233
3234  return true;
3235}
3236
3237/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3238/// is suitable for input to PSHUFLW.
3239static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
3240  if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16))
3241    return false;
3242
3243  // Upper quadword copied in order.
3244  if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
3245    return false;
3246
3247  // Lower quadword shuffled.
3248  for (unsigned i = 0; i != 4; ++i)
3249    if (!isUndefOrInRange(Mask[i], 0, 4))
3250      return false;
3251
3252  if (VT == MVT::v16i16) {
3253    // Upper quadword copied in order.
3254    if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
3255      return false;
3256
3257    // Lower quadword shuffled.
3258    for (unsigned i = 8; i != 12; ++i)
3259      if (!isUndefOrInRange(Mask[i], 8, 12))
3260        return false;
3261  }
3262
3263  return true;
3264}
3265
3266/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
3267/// is suitable for input to PALIGNR.
3268static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
3269                          const X86Subtarget *Subtarget) {
3270  if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) ||
3271      (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()))
3272    return false;
3273
3274  unsigned NumElts = VT.getVectorNumElements();
3275  unsigned NumLanes = VT.getSizeInBits()/128;
3276  unsigned NumLaneElts = NumElts/NumLanes;
3277
3278  // Do not handle 64-bit element shuffles with palignr.
3279  if (NumLaneElts == 2)
3280    return false;
3281
3282  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
3283    unsigned i;
3284    for (i = 0; i != NumLaneElts; ++i) {
3285      if (Mask[i+l] >= 0)
3286        break;
3287    }
3288
3289    // Lane is all undef, go to next lane
3290    if (i == NumLaneElts)
3291      continue;
3292
3293    int Start = Mask[i+l];
3294
3295    // Make sure its in this lane in one of the sources
3296    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
3297        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
3298      return false;
3299
3300    // If not lane 0, then we must match lane 0
3301    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
3302      return false;
3303
3304    // Correct second source to be contiguous with first source
3305    if (Start >= (int)NumElts)
3306      Start -= NumElts - NumLaneElts;
3307
3308    // Make sure we're shifting in the right direction.
3309    if (Start <= (int)(i+l))
3310      return false;
3311
3312    Start -= i;
3313
3314    // Check the rest of the elements to see if they are consecutive.
3315    for (++i; i != NumLaneElts; ++i) {
3316      int Idx = Mask[i+l];
3317
3318      // Make sure its in this lane
3319      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
3320          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
3321        return false;
3322
3323      // If not lane 0, then we must match lane 0
3324      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
3325        return false;
3326
3327      if (Idx >= (int)NumElts)
3328        Idx -= NumElts - NumLaneElts;
3329
3330      if (!isUndefOrEqual(Idx, Start+i))
3331        return false;
3332
3333    }
3334  }
3335
3336  return true;
3337}
3338
3339/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
3340/// the two vector operands have swapped position.
3341static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
3342                                     unsigned NumElems) {
3343  for (unsigned i = 0; i != NumElems; ++i) {
3344    int idx = Mask[i];
3345    if (idx < 0)
3346      continue;
3347    else if (idx < (int)NumElems)
3348      Mask[i] = idx + NumElems;
3349    else
3350      Mask[i] = idx - NumElems;
3351  }
3352}
3353
3354/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
3355/// specifies a shuffle of elements that is suitable for input to 128/256-bit
3356/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
3357/// reverse of what x86 shuffles want.
3358static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX,
3359                        bool Commuted = false) {
3360  if (!HasAVX && VT.getSizeInBits() == 256)
3361    return false;
3362
3363  unsigned NumElems = VT.getVectorNumElements();
3364  unsigned NumLanes = VT.getSizeInBits()/128;
3365  unsigned NumLaneElems = NumElems/NumLanes;
3366
3367  if (NumLaneElems != 2 && NumLaneElems != 4)
3368    return false;
3369
3370  // VSHUFPSY divides the resulting vector into 4 chunks.
3371  // The sources are also splitted into 4 chunks, and each destination
3372  // chunk must come from a different source chunk.
3373  //
3374  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
3375  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
3376  //
3377  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
3378  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
3379  //
3380  // VSHUFPDY divides the resulting vector into 4 chunks.
3381  // The sources are also splitted into 4 chunks, and each destination
3382  // chunk must come from a different source chunk.
3383  //
3384  //  SRC1 =>      X3       X2       X1       X0
3385  //  SRC2 =>      Y3       Y2       Y1       Y0
3386  //
3387  //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
3388  //
3389  unsigned HalfLaneElems = NumLaneElems/2;
3390  for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
3391    for (unsigned i = 0; i != NumLaneElems; ++i) {
3392      int Idx = Mask[i+l];
3393      unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
3394      if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
3395        return false;
3396      // For VSHUFPSY, the mask of the second half must be the same as the
3397      // first but with the appropriate offsets. This works in the same way as
3398      // VPERMILPS works with masks.
3399      if (NumElems != 8 || l == 0 || Mask[i] < 0)
3400        continue;
3401      if (!isUndefOrEqual(Idx, Mask[i]+l))
3402        return false;
3403    }
3404  }
3405
3406  return true;
3407}
3408
3409/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
3410/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
3411static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
3412  if (!VT.is128BitVector())
3413    return false;
3414
3415  unsigned NumElems = VT.getVectorNumElements();
3416
3417  if (NumElems != 4)
3418    return false;
3419
3420  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
3421  return isUndefOrEqual(Mask[0], 6) &&
3422         isUndefOrEqual(Mask[1], 7) &&
3423         isUndefOrEqual(Mask[2], 2) &&
3424         isUndefOrEqual(Mask[3], 3);
3425}
3426
3427/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
3428/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
3429/// <2, 3, 2, 3>
3430static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
3431  if (!VT.is128BitVector())
3432    return false;
3433
3434  unsigned NumElems = VT.getVectorNumElements();
3435
3436  if (NumElems != 4)
3437    return false;
3438
3439  return isUndefOrEqual(Mask[0], 2) &&
3440         isUndefOrEqual(Mask[1], 3) &&
3441         isUndefOrEqual(Mask[2], 2) &&
3442         isUndefOrEqual(Mask[3], 3);
3443}
3444
3445/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
3446/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
3447static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
3448  if (!VT.is128BitVector())
3449    return false;
3450
3451  unsigned NumElems = VT.getVectorNumElements();
3452
3453  if (NumElems != 2 && NumElems != 4)
3454    return false;
3455
3456  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3457    if (!isUndefOrEqual(Mask[i], i + NumElems))
3458      return false;
3459
3460  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
3461    if (!isUndefOrEqual(Mask[i], i))
3462      return false;
3463
3464  return true;
3465}
3466
3467/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
3468/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
3469static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
3470  if (!VT.is128BitVector())
3471    return false;
3472
3473  unsigned NumElems = VT.getVectorNumElements();
3474
3475  if (NumElems != 2 && NumElems != 4)
3476    return false;
3477
3478  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3479    if (!isUndefOrEqual(Mask[i], i))
3480      return false;
3481
3482  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3483    if (!isUndefOrEqual(Mask[i + e], i + NumElems))
3484      return false;
3485
3486  return true;
3487}
3488
3489//
3490// Some special combinations that can be optimized.
3491//
3492static
3493SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
3494                               SelectionDAG &DAG) {
3495  EVT VT = SVOp->getValueType(0);
3496  DebugLoc dl = SVOp->getDebugLoc();
3497
3498  if (VT != MVT::v8i32 && VT != MVT::v8f32)
3499    return SDValue();
3500
3501  ArrayRef<int> Mask = SVOp->getMask();
3502
3503  // These are the special masks that may be optimized.
3504  static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
3505  static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
3506  bool MatchEvenMask = true;
3507  bool MatchOddMask  = true;
3508  for (int i=0; i<8; ++i) {
3509    if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
3510      MatchEvenMask = false;
3511    if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
3512      MatchOddMask = false;
3513  }
3514
3515  if (!MatchEvenMask && !MatchOddMask)
3516    return SDValue();
3517
3518  SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
3519
3520  SDValue Op0 = SVOp->getOperand(0);
3521  SDValue Op1 = SVOp->getOperand(1);
3522
3523  if (MatchEvenMask) {
3524    // Shift the second operand right to 32 bits.
3525    static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
3526    Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
3527  } else {
3528    // Shift the first operand left to 32 bits.
3529    static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
3530    Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
3531  }
3532  static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
3533  return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
3534}
3535
3536/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
3537/// specifies a shuffle of elements that is suitable for input to UNPCKL.
3538static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
3539                         bool HasAVX2, bool V2IsSplat = false) {
3540  unsigned NumElts = VT.getVectorNumElements();
3541
3542  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3543         "Unsupported vector type for unpckh");
3544
3545  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3546      (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
3547    return false;
3548
3549  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3550  // independently on 128-bit lanes.
3551  unsigned NumLanes = VT.getSizeInBits()/128;
3552  unsigned NumLaneElts = NumElts/NumLanes;
3553
3554  for (unsigned l = 0; l != NumLanes; ++l) {
3555    for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
3556         i != (l+1)*NumLaneElts;
3557         i += 2, ++j) {
3558      int BitI  = Mask[i];
3559      int BitI1 = Mask[i+1];
3560      if (!isUndefOrEqual(BitI, j))
3561        return false;
3562      if (V2IsSplat) {
3563        if (!isUndefOrEqual(BitI1, NumElts))
3564          return false;
3565      } else {
3566        if (!isUndefOrEqual(BitI1, j + NumElts))
3567          return false;
3568      }
3569    }
3570  }
3571
3572  return true;
3573}
3574
3575/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
3576/// specifies a shuffle of elements that is suitable for input to UNPCKH.
3577static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
3578                         bool HasAVX2, bool V2IsSplat = false) {
3579  unsigned NumElts = VT.getVectorNumElements();
3580
3581  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3582         "Unsupported vector type for unpckh");
3583
3584  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3585      (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
3586    return false;
3587
3588  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3589  // independently on 128-bit lanes.
3590  unsigned NumLanes = VT.getSizeInBits()/128;
3591  unsigned NumLaneElts = NumElts/NumLanes;
3592
3593  for (unsigned l = 0; l != NumLanes; ++l) {
3594    for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
3595         i != (l+1)*NumLaneElts; i += 2, ++j) {
3596      int BitI  = Mask[i];
3597      int BitI1 = Mask[i+1];
3598      if (!isUndefOrEqual(BitI, j))
3599        return false;
3600      if (V2IsSplat) {
3601        if (isUndefOrEqual(BitI1, NumElts))
3602          return false;
3603      } else {
3604        if (!isUndefOrEqual(BitI1, j+NumElts))
3605          return false;
3606      }
3607    }
3608  }
3609  return true;
3610}
3611
3612/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
3613/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
3614/// <0, 0, 1, 1>
3615static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT,
3616                                  bool HasAVX2) {
3617  unsigned NumElts = VT.getVectorNumElements();
3618
3619  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3620         "Unsupported vector type for unpckh");
3621
3622  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3623      (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
3624    return false;
3625
3626  // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
3627  // FIXME: Need a better way to get rid of this, there's no latency difference
3628  // between UNPCKLPD and MOVDDUP, the later should always be checked first and
3629  // the former later. We should also remove the "_undef" special mask.
3630  if (NumElts == 4 && VT.getSizeInBits() == 256)
3631    return false;
3632
3633  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3634  // independently on 128-bit lanes.
3635  unsigned NumLanes = VT.getSizeInBits()/128;
3636  unsigned NumLaneElts = NumElts/NumLanes;
3637
3638  for (unsigned l = 0; l != NumLanes; ++l) {
3639    for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
3640         i != (l+1)*NumLaneElts;
3641         i += 2, ++j) {
3642      int BitI  = Mask[i];
3643      int BitI1 = Mask[i+1];
3644
3645      if (!isUndefOrEqual(BitI, j))
3646        return false;
3647      if (!isUndefOrEqual(BitI1, j))
3648        return false;
3649    }
3650  }
3651
3652  return true;
3653}
3654
3655/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
3656/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
3657/// <2, 2, 3, 3>
3658static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
3659  unsigned NumElts = VT.getVectorNumElements();
3660
3661  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3662         "Unsupported vector type for unpckh");
3663
3664  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3665      (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
3666    return false;
3667
3668  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3669  // independently on 128-bit lanes.
3670  unsigned NumLanes = VT.getSizeInBits()/128;
3671  unsigned NumLaneElts = NumElts/NumLanes;
3672
3673  for (unsigned l = 0; l != NumLanes; ++l) {
3674    for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
3675         i != (l+1)*NumLaneElts; i += 2, ++j) {
3676      int BitI  = Mask[i];
3677      int BitI1 = Mask[i+1];
3678      if (!isUndefOrEqual(BitI, j))
3679        return false;
3680      if (!isUndefOrEqual(BitI1, j))
3681        return false;
3682    }
3683  }
3684  return true;
3685}
3686
3687/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
3688/// specifies a shuffle of elements that is suitable for input to MOVSS,
3689/// MOVSD, and MOVD, i.e. setting the lowest element.
3690static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
3691  if (VT.getVectorElementType().getSizeInBits() < 32)
3692    return false;
3693  if (!VT.is128BitVector())
3694    return false;
3695
3696  unsigned NumElts = VT.getVectorNumElements();
3697
3698  if (!isUndefOrEqual(Mask[0], NumElts))
3699    return false;
3700
3701  for (unsigned i = 1; i != NumElts; ++i)
3702    if (!isUndefOrEqual(Mask[i], i))
3703      return false;
3704
3705  return true;
3706}
3707
3708/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
3709/// as permutations between 128-bit chunks or halves. As an example: this
3710/// shuffle bellow:
3711///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
3712/// The first half comes from the second half of V1 and the second half from the
3713/// the second half of V2.
3714static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
3715  if (!HasAVX || !VT.is256BitVector())
3716    return false;
3717
3718  // The shuffle result is divided into half A and half B. In total the two
3719  // sources have 4 halves, namely: C, D, E, F. The final values of A and
3720  // B must come from C, D, E or F.
3721  unsigned HalfSize = VT.getVectorNumElements()/2;
3722  bool MatchA = false, MatchB = false;
3723
3724  // Check if A comes from one of C, D, E, F.
3725  for (unsigned Half = 0; Half != 4; ++Half) {
3726    if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
3727      MatchA = true;
3728      break;
3729    }
3730  }
3731
3732  // Check if B comes from one of C, D, E, F.
3733  for (unsigned Half = 0; Half != 4; ++Half) {
3734    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
3735      MatchB = true;
3736      break;
3737    }
3738  }
3739
3740  return MatchA && MatchB;
3741}
3742
3743/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
3744/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
3745static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
3746  EVT VT = SVOp->getValueType(0);
3747
3748  unsigned HalfSize = VT.getVectorNumElements()/2;
3749
3750  unsigned FstHalf = 0, SndHalf = 0;
3751  for (unsigned i = 0; i < HalfSize; ++i) {
3752    if (SVOp->getMaskElt(i) > 0) {
3753      FstHalf = SVOp->getMaskElt(i)/HalfSize;
3754      break;
3755    }
3756  }
3757  for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
3758    if (SVOp->getMaskElt(i) > 0) {
3759      SndHalf = SVOp->getMaskElt(i)/HalfSize;
3760      break;
3761    }
3762  }
3763
3764  return (FstHalf | (SndHalf << 4));
3765}
3766
3767/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
3768/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
3769/// Note that VPERMIL mask matching is different depending whether theunderlying
3770/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
3771/// to the same elements of the low, but to the higher half of the source.
3772/// In VPERMILPD the two lanes could be shuffled independently of each other
3773/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
3774static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
3775  if (!HasAVX)
3776    return false;
3777
3778  unsigned NumElts = VT.getVectorNumElements();
3779  // Only match 256-bit with 32/64-bit types
3780  if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8))
3781    return false;
3782
3783  unsigned NumLanes = VT.getSizeInBits()/128;
3784  unsigned LaneSize = NumElts/NumLanes;
3785  for (unsigned l = 0; l != NumElts; l += LaneSize) {
3786    for (unsigned i = 0; i != LaneSize; ++i) {
3787      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
3788        return false;
3789      if (NumElts != 8 || l == 0)
3790        continue;
3791      // VPERMILPS handling
3792      if (Mask[i] < 0)
3793        continue;
3794      if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
3795        return false;
3796    }
3797  }
3798
3799  return true;
3800}
3801
3802/// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
3803/// of what x86 movss want. X86 movs requires the lowest  element to be lowest
3804/// element of vector 2 and the other elements to come from vector 1 in order.
3805static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
3806                               bool V2IsSplat = false, bool V2IsUndef = false) {
3807  if (!VT.is128BitVector())
3808    return false;
3809
3810  unsigned NumOps = VT.getVectorNumElements();
3811  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
3812    return false;
3813
3814  if (!isUndefOrEqual(Mask[0], 0))
3815    return false;
3816
3817  for (unsigned i = 1; i != NumOps; ++i)
3818    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
3819          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
3820          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
3821      return false;
3822
3823  return true;
3824}
3825
3826/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3827/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
3828/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
3829static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
3830                           const X86Subtarget *Subtarget) {
3831  if (!Subtarget->hasSSE3())
3832    return false;
3833
3834  unsigned NumElems = VT.getVectorNumElements();
3835
3836  if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
3837      (VT.getSizeInBits() == 256 && NumElems != 8))
3838    return false;
3839
3840  // "i+1" is the value the indexed mask element must have
3841  for (unsigned i = 0; i != NumElems; i += 2)
3842    if (!isUndefOrEqual(Mask[i], i+1) ||
3843        !isUndefOrEqual(Mask[i+1], i+1))
3844      return false;
3845
3846  return true;
3847}
3848
3849/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3850/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
3851/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
3852static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
3853                           const X86Subtarget *Subtarget) {
3854  if (!Subtarget->hasSSE3())
3855    return false;
3856
3857  unsigned NumElems = VT.getVectorNumElements();
3858
3859  if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
3860      (VT.getSizeInBits() == 256 && NumElems != 8))
3861    return false;
3862
3863  // "i" is the value the indexed mask element must have
3864  for (unsigned i = 0; i != NumElems; i += 2)
3865    if (!isUndefOrEqual(Mask[i], i) ||
3866        !isUndefOrEqual(Mask[i+1], i))
3867      return false;
3868
3869  return true;
3870}
3871
3872/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
3873/// specifies a shuffle of elements that is suitable for input to 256-bit
3874/// version of MOVDDUP.
3875static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
3876  if (!HasAVX || !VT.is256BitVector())
3877    return false;
3878
3879  unsigned NumElts = VT.getVectorNumElements();
3880  if (NumElts != 4)
3881    return false;
3882
3883  for (unsigned i = 0; i != NumElts/2; ++i)
3884    if (!isUndefOrEqual(Mask[i], 0))
3885      return false;
3886  for (unsigned i = NumElts/2; i != NumElts; ++i)
3887    if (!isUndefOrEqual(Mask[i], NumElts/2))
3888      return false;
3889  return true;
3890}
3891
3892/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3893/// specifies a shuffle of elements that is suitable for input to 128-bit
3894/// version of MOVDDUP.
3895static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
3896  if (!VT.is128BitVector())
3897    return false;
3898
3899  unsigned e = VT.getVectorNumElements() / 2;
3900  for (unsigned i = 0; i != e; ++i)
3901    if (!isUndefOrEqual(Mask[i], i))
3902      return false;
3903  for (unsigned i = 0; i != e; ++i)
3904    if (!isUndefOrEqual(Mask[e+i], i))
3905      return false;
3906  return true;
3907}
3908
3909/// isVEXTRACTF128Index - Return true if the specified
3910/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
3911/// suitable for input to VEXTRACTF128.
3912bool X86::isVEXTRACTF128Index(SDNode *N) {
3913  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
3914    return false;
3915
3916  // The index should be aligned on a 128-bit boundary.
3917  uint64_t Index =
3918    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
3919
3920  unsigned VL = N->getValueType(0).getVectorNumElements();
3921  unsigned VBits = N->getValueType(0).getSizeInBits();
3922  unsigned ElSize = VBits / VL;
3923  bool Result = (Index * ElSize) % 128 == 0;
3924
3925  return Result;
3926}
3927
3928/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR
3929/// operand specifies a subvector insert that is suitable for input to
3930/// VINSERTF128.
3931bool X86::isVINSERTF128Index(SDNode *N) {
3932  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
3933    return false;
3934
3935  // The index should be aligned on a 128-bit boundary.
3936  uint64_t Index =
3937    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
3938
3939  unsigned VL = N->getValueType(0).getVectorNumElements();
3940  unsigned VBits = N->getValueType(0).getSizeInBits();
3941  unsigned ElSize = VBits / VL;
3942  bool Result = (Index * ElSize) % 128 == 0;
3943
3944  return Result;
3945}
3946
3947/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
3948/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
3949/// Handles 128-bit and 256-bit.
3950static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
3951  EVT VT = N->getValueType(0);
3952
3953  assert((VT.is128BitVector() || VT.is256BitVector()) &&
3954         "Unsupported vector type for PSHUF/SHUFP");
3955
3956  // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
3957  // independently on 128-bit lanes.
3958  unsigned NumElts = VT.getVectorNumElements();
3959  unsigned NumLanes = VT.getSizeInBits()/128;
3960  unsigned NumLaneElts = NumElts/NumLanes;
3961
3962  assert((NumLaneElts == 2 || NumLaneElts == 4) &&
3963         "Only supports 2 or 4 elements per lane");
3964
3965  unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
3966  unsigned Mask = 0;
3967  for (unsigned i = 0; i != NumElts; ++i) {
3968    int Elt = N->getMaskElt(i);
3969    if (Elt < 0) continue;
3970    Elt &= NumLaneElts - 1;
3971    unsigned ShAmt = (i << Shift) % 8;
3972    Mask |= Elt << ShAmt;
3973  }
3974
3975  return Mask;
3976}
3977
3978/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
3979/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
3980static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
3981  EVT VT = N->getValueType(0);
3982
3983  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
3984         "Unsupported vector type for PSHUFHW");
3985
3986  unsigned NumElts = VT.getVectorNumElements();
3987
3988  unsigned Mask = 0;
3989  for (unsigned l = 0; l != NumElts; l += 8) {
3990    // 8 nodes per lane, but we only care about the last 4.
3991    for (unsigned i = 0; i < 4; ++i) {
3992      int Elt = N->getMaskElt(l+i+4);
3993      if (Elt < 0) continue;
3994      Elt &= 0x3; // only 2-bits.
3995      Mask |= Elt << (i * 2);
3996    }
3997  }
3998
3999  return Mask;
4000}
4001
4002/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4003/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
4004static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4005  EVT VT = N->getValueType(0);
4006
4007  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4008         "Unsupported vector type for PSHUFHW");
4009
4010  unsigned NumElts = VT.getVectorNumElements();
4011
4012  unsigned Mask = 0;
4013  for (unsigned l = 0; l != NumElts; l += 8) {
4014    // 8 nodes per lane, but we only care about the first 4.
4015    for (unsigned i = 0; i < 4; ++i) {
4016      int Elt = N->getMaskElt(l+i);
4017      if (Elt < 0) continue;
4018      Elt &= 0x3; // only 2-bits
4019      Mask |= Elt << (i * 2);
4020    }
4021  }
4022
4023  return Mask;
4024}
4025
4026/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
4027/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
4028static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4029  EVT VT = SVOp->getValueType(0);
4030  unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
4031
4032  unsigned NumElts = VT.getVectorNumElements();
4033  unsigned NumLanes = VT.getSizeInBits()/128;
4034  unsigned NumLaneElts = NumElts/NumLanes;
4035
4036  int Val = 0;
4037  unsigned i;
4038  for (i = 0; i != NumElts; ++i) {
4039    Val = SVOp->getMaskElt(i);
4040    if (Val >= 0)
4041      break;
4042  }
4043  if (Val >= (int)NumElts)
4044    Val -= NumElts - NumLaneElts;
4045
4046  assert(Val - i > 0 && "PALIGNR imm should be positive");
4047  return (Val - i) * EltSize;
4048}
4049
4050/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate
4051/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4052/// instructions.
4053unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
4054  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4055    llvm_unreachable("Illegal extract subvector for VEXTRACTF128");
4056
4057  uint64_t Index =
4058    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4059
4060  EVT VecVT = N->getOperand(0).getValueType();
4061  EVT ElVT = VecVT.getVectorElementType();
4062
4063  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
4064  return Index / NumElemsPerChunk;
4065}
4066
4067/// getInsertVINSERTF128Immediate - Return the appropriate immediate
4068/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
4069/// instructions.
4070unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
4071  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4072    llvm_unreachable("Illegal insert subvector for VINSERTF128");
4073
4074  uint64_t Index =
4075    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4076
4077  EVT VecVT = N->getValueType(0);
4078  EVT ElVT = VecVT.getVectorElementType();
4079
4080  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
4081  return Index / NumElemsPerChunk;
4082}
4083
4084/// getShuffleCLImmediate - Return the appropriate immediate to shuffle
4085/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
4086/// Handles 256-bit.
4087static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
4088  EVT VT = N->getValueType(0);
4089
4090  unsigned NumElts = VT.getVectorNumElements();
4091
4092  assert((VT.is256BitVector() && NumElts == 4) &&
4093         "Unsupported vector type for VPERMQ/VPERMPD");
4094
4095  unsigned Mask = 0;
4096  for (unsigned i = 0; i != NumElts; ++i) {
4097    int Elt = N->getMaskElt(i);
4098    if (Elt < 0)
4099      continue;
4100    Mask |= Elt << (i*2);
4101  }
4102
4103  return Mask;
4104}
4105/// isZeroNode - Returns true if Elt is a constant zero or a floating point
4106/// constant +0.0.
4107bool X86::isZeroNode(SDValue Elt) {
4108  return ((isa<ConstantSDNode>(Elt) &&
4109           cast<ConstantSDNode>(Elt)->isNullValue()) ||
4110          (isa<ConstantFPSDNode>(Elt) &&
4111           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
4112}
4113
4114/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
4115/// their permute mask.
4116static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
4117                                    SelectionDAG &DAG) {
4118  EVT VT = SVOp->getValueType(0);
4119  unsigned NumElems = VT.getVectorNumElements();
4120  SmallVector<int, 8> MaskVec;
4121
4122  for (unsigned i = 0; i != NumElems; ++i) {
4123    int Idx = SVOp->getMaskElt(i);
4124    if (Idx >= 0) {
4125      if (Idx < (int)NumElems)
4126        Idx += NumElems;
4127      else
4128        Idx -= NumElems;
4129    }
4130    MaskVec.push_back(Idx);
4131  }
4132  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
4133                              SVOp->getOperand(0), &MaskVec[0]);
4134}
4135
4136/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
4137/// match movhlps. The lower half elements should come from upper half of
4138/// V1 (and in order), and the upper half elements should come from the upper
4139/// half of V2 (and in order).
4140static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
4141  if (!VT.is128BitVector())
4142    return false;
4143  if (VT.getVectorNumElements() != 4)
4144    return false;
4145  for (unsigned i = 0, e = 2; i != e; ++i)
4146    if (!isUndefOrEqual(Mask[i], i+2))
4147      return false;
4148  for (unsigned i = 2; i != 4; ++i)
4149    if (!isUndefOrEqual(Mask[i], i+4))
4150      return false;
4151  return true;
4152}
4153
4154/// isScalarLoadToVector - Returns true if the node is a scalar load that
4155/// is promoted to a vector. It also returns the LoadSDNode by reference if
4156/// required.
4157static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
4158  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
4159    return false;
4160  N = N->getOperand(0).getNode();
4161  if (!ISD::isNON_EXTLoad(N))
4162    return false;
4163  if (LD)
4164    *LD = cast<LoadSDNode>(N);
4165  return true;
4166}
4167
4168// Test whether the given value is a vector value which will be legalized
4169// into a load.
4170static bool WillBeConstantPoolLoad(SDNode *N) {
4171  if (N->getOpcode() != ISD::BUILD_VECTOR)
4172    return false;
4173
4174  // Check for any non-constant elements.
4175  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
4176    switch (N->getOperand(i).getNode()->getOpcode()) {
4177    case ISD::UNDEF:
4178    case ISD::ConstantFP:
4179    case ISD::Constant:
4180      break;
4181    default:
4182      return false;
4183    }
4184
4185  // Vectors of all-zeros and all-ones are materialized with special
4186  // instructions rather than being loaded.
4187  return !ISD::isBuildVectorAllZeros(N) &&
4188         !ISD::isBuildVectorAllOnes(N);
4189}
4190
4191/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
4192/// match movlp{s|d}. The lower half elements should come from lower half of
4193/// V1 (and in order), and the upper half elements should come from the upper
4194/// half of V2 (and in order). And since V1 will become the source of the
4195/// MOVLP, it must be either a vector load or a scalar load to vector.
4196static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
4197                               ArrayRef<int> Mask, EVT VT) {
4198  if (!VT.is128BitVector())
4199    return false;
4200
4201  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
4202    return false;
4203  // Is V2 is a vector load, don't do this transformation. We will try to use
4204  // load folding shufps op.
4205  if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
4206    return false;
4207
4208  unsigned NumElems = VT.getVectorNumElements();
4209
4210  if (NumElems != 2 && NumElems != 4)
4211    return false;
4212  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4213    if (!isUndefOrEqual(Mask[i], i))
4214      return false;
4215  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4216    if (!isUndefOrEqual(Mask[i], i+NumElems))
4217      return false;
4218  return true;
4219}
4220
4221/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
4222/// all the same.
4223static bool isSplatVector(SDNode *N) {
4224  if (N->getOpcode() != ISD::BUILD_VECTOR)
4225    return false;
4226
4227  SDValue SplatValue = N->getOperand(0);
4228  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
4229    if (N->getOperand(i) != SplatValue)
4230      return false;
4231  return true;
4232}
4233
4234/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
4235/// to an zero vector.
4236/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
4237static bool isZeroShuffle(ShuffleVectorSDNode *N) {
4238  SDValue V1 = N->getOperand(0);
4239  SDValue V2 = N->getOperand(1);
4240  unsigned NumElems = N->getValueType(0).getVectorNumElements();
4241  for (unsigned i = 0; i != NumElems; ++i) {
4242    int Idx = N->getMaskElt(i);
4243    if (Idx >= (int)NumElems) {
4244      unsigned Opc = V2.getOpcode();
4245      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
4246        continue;
4247      if (Opc != ISD::BUILD_VECTOR ||
4248          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
4249        return false;
4250    } else if (Idx >= 0) {
4251      unsigned Opc = V1.getOpcode();
4252      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
4253        continue;
4254      if (Opc != ISD::BUILD_VECTOR ||
4255          !X86::isZeroNode(V1.getOperand(Idx)))
4256        return false;
4257    }
4258  }
4259  return true;
4260}
4261
4262/// getZeroVector - Returns a vector of specified type with all zero elements.
4263///
4264static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
4265                             SelectionDAG &DAG, DebugLoc dl) {
4266  assert(VT.isVector() && "Expected a vector type");
4267  unsigned Size = VT.getSizeInBits();
4268
4269  // Always build SSE zero vectors as <4 x i32> bitcasted
4270  // to their dest type. This ensures they get CSE'd.
4271  SDValue Vec;
4272  if (Size == 128) {  // SSE
4273    if (Subtarget->hasSSE2()) {  // SSE2
4274      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4275      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4276    } else { // SSE1
4277      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4278      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
4279    }
4280  } else if (Size == 256) { // AVX
4281    if (Subtarget->hasAVX2()) { // AVX2
4282      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4283      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4284      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
4285    } else {
4286      // 256-bit logic and arithmetic instructions in AVX are all
4287      // floating-point, no support for integer ops. Emit fp zeroed vectors.
4288      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4289      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4290      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
4291    }
4292  } else
4293    llvm_unreachable("Unexpected vector type");
4294
4295  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4296}
4297
4298/// getOnesVector - Returns a vector of specified type with all bits set.
4299/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
4300/// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
4301/// Then bitcast to their original type, ensuring they get CSE'd.
4302static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG,
4303                             DebugLoc dl) {
4304  assert(VT.isVector() && "Expected a vector type");
4305  unsigned Size = VT.getSizeInBits();
4306
4307  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
4308  SDValue Vec;
4309  if (Size == 256) {
4310    if (HasAVX2) { // AVX2
4311      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4312      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
4313    } else { // AVX
4314      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4315      Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
4316    }
4317  } else if (Size == 128) {
4318    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4319  } else
4320    llvm_unreachable("Unexpected vector type");
4321
4322  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4323}
4324
4325/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
4326/// that point to V2 points to its first element.
4327static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
4328  for (unsigned i = 0; i != NumElems; ++i) {
4329    if (Mask[i] > (int)NumElems) {
4330      Mask[i] = NumElems;
4331    }
4332  }
4333}
4334
4335/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
4336/// operation of specified width.
4337static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4338                       SDValue V2) {
4339  unsigned NumElems = VT.getVectorNumElements();
4340  SmallVector<int, 8> Mask;
4341  Mask.push_back(NumElems);
4342  for (unsigned i = 1; i != NumElems; ++i)
4343    Mask.push_back(i);
4344  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4345}
4346
4347/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
4348static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4349                          SDValue V2) {
4350  unsigned NumElems = VT.getVectorNumElements();
4351  SmallVector<int, 8> Mask;
4352  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
4353    Mask.push_back(i);
4354    Mask.push_back(i + NumElems);
4355  }
4356  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4357}
4358
4359/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
4360static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4361                          SDValue V2) {
4362  unsigned NumElems = VT.getVectorNumElements();
4363  SmallVector<int, 8> Mask;
4364  for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
4365    Mask.push_back(i + Half);
4366    Mask.push_back(i + NumElems + Half);
4367  }
4368  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4369}
4370
4371// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
4372// a generic shuffle instruction because the target has no such instructions.
4373// Generate shuffles which repeat i16 and i8 several times until they can be
4374// represented by v4f32 and then be manipulated by target suported shuffles.
4375static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
4376  EVT VT = V.getValueType();
4377  int NumElems = VT.getVectorNumElements();
4378  DebugLoc dl = V.getDebugLoc();
4379
4380  while (NumElems > 4) {
4381    if (EltNo < NumElems/2) {
4382      V = getUnpackl(DAG, dl, VT, V, V);
4383    } else {
4384      V = getUnpackh(DAG, dl, VT, V, V);
4385      EltNo -= NumElems/2;
4386    }
4387    NumElems >>= 1;
4388  }
4389  return V;
4390}
4391
4392/// getLegalSplat - Generate a legal splat with supported x86 shuffles
4393static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
4394  EVT VT = V.getValueType();
4395  DebugLoc dl = V.getDebugLoc();
4396  unsigned Size = VT.getSizeInBits();
4397
4398  if (Size == 128) {
4399    V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
4400    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
4401    V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
4402                             &SplatMask[0]);
4403  } else if (Size == 256) {
4404    // To use VPERMILPS to splat scalars, the second half of indicies must
4405    // refer to the higher part, which is a duplication of the lower one,
4406    // because VPERMILPS can only handle in-lane permutations.
4407    int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
4408                         EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
4409
4410    V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
4411    V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
4412                             &SplatMask[0]);
4413  } else
4414    llvm_unreachable("Vector size not supported");
4415
4416  return DAG.getNode(ISD::BITCAST, dl, VT, V);
4417}
4418
4419/// PromoteSplat - Splat is promoted to target supported vector shuffles.
4420static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
4421  EVT SrcVT = SV->getValueType(0);
4422  SDValue V1 = SV->getOperand(0);
4423  DebugLoc dl = SV->getDebugLoc();
4424
4425  int EltNo = SV->getSplatIndex();
4426  int NumElems = SrcVT.getVectorNumElements();
4427  unsigned Size = SrcVT.getSizeInBits();
4428
4429  assert(((Size == 128 && NumElems > 4) || Size == 256) &&
4430          "Unknown how to promote splat for type");
4431
4432  // Extract the 128-bit part containing the splat element and update
4433  // the splat element index when it refers to the higher register.
4434  if (Size == 256) {
4435    V1 = Extract128BitVector(V1, EltNo, DAG, dl);
4436    if (EltNo >= NumElems/2)
4437      EltNo -= NumElems/2;
4438  }
4439
4440  // All i16 and i8 vector types can't be used directly by a generic shuffle
4441  // instruction because the target has no such instruction. Generate shuffles
4442  // which repeat i16 and i8 several times until they fit in i32, and then can
4443  // be manipulated by target suported shuffles.
4444  EVT EltVT = SrcVT.getVectorElementType();
4445  if (EltVT == MVT::i8 || EltVT == MVT::i16)
4446    V1 = PromoteSplati8i16(V1, DAG, EltNo);
4447
4448  // Recreate the 256-bit vector and place the same 128-bit vector
4449  // into the low and high part. This is necessary because we want
4450  // to use VPERM* to shuffle the vectors
4451  if (Size == 256) {
4452    V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
4453  }
4454
4455  return getLegalSplat(DAG, V1, EltNo);
4456}
4457
4458/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
4459/// vector of zero or undef vector.  This produces a shuffle where the low
4460/// element of V2 is swizzled into the zero/undef vector, landing at element
4461/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
4462static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
4463                                           bool IsZero,
4464                                           const X86Subtarget *Subtarget,
4465                                           SelectionDAG &DAG) {
4466  EVT VT = V2.getValueType();
4467  SDValue V1 = IsZero
4468    ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
4469  unsigned NumElems = VT.getVectorNumElements();
4470  SmallVector<int, 16> MaskVec;
4471  for (unsigned i = 0; i != NumElems; ++i)
4472    // If this is the insertion idx, put the low elt of V2 here.
4473    MaskVec.push_back(i == Idx ? NumElems : i);
4474  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
4475}
4476
4477/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
4478/// target specific opcode. Returns true if the Mask could be calculated.
4479/// Sets IsUnary to true if only uses one source.
4480static bool getTargetShuffleMask(SDNode *N, MVT VT,
4481                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
4482  unsigned NumElems = VT.getVectorNumElements();
4483  SDValue ImmN;
4484
4485  IsUnary = false;
4486  switch(N->getOpcode()) {
4487  case X86ISD::SHUFP:
4488    ImmN = N->getOperand(N->getNumOperands()-1);
4489    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4490    break;
4491  case X86ISD::UNPCKH:
4492    DecodeUNPCKHMask(VT, Mask);
4493    break;
4494  case X86ISD::UNPCKL:
4495    DecodeUNPCKLMask(VT, Mask);
4496    break;
4497  case X86ISD::MOVHLPS:
4498    DecodeMOVHLPSMask(NumElems, Mask);
4499    break;
4500  case X86ISD::MOVLHPS:
4501    DecodeMOVLHPSMask(NumElems, Mask);
4502    break;
4503  case X86ISD::PSHUFD:
4504  case X86ISD::VPERMILP:
4505    ImmN = N->getOperand(N->getNumOperands()-1);
4506    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4507    IsUnary = true;
4508    break;
4509  case X86ISD::PSHUFHW:
4510    ImmN = N->getOperand(N->getNumOperands()-1);
4511    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4512    IsUnary = true;
4513    break;
4514  case X86ISD::PSHUFLW:
4515    ImmN = N->getOperand(N->getNumOperands()-1);
4516    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4517    IsUnary = true;
4518    break;
4519  case X86ISD::VPERMI:
4520    ImmN = N->getOperand(N->getNumOperands()-1);
4521    DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4522    IsUnary = true;
4523    break;
4524  case X86ISD::MOVSS:
4525  case X86ISD::MOVSD: {
4526    // The index 0 always comes from the first element of the second source,
4527    // this is why MOVSS and MOVSD are used in the first place. The other
4528    // elements come from the other positions of the first source vector
4529    Mask.push_back(NumElems);
4530    for (unsigned i = 1; i != NumElems; ++i) {
4531      Mask.push_back(i);
4532    }
4533    break;
4534  }
4535  case X86ISD::VPERM2X128:
4536    ImmN = N->getOperand(N->getNumOperands()-1);
4537    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4538    if (Mask.empty()) return false;
4539    break;
4540  case X86ISD::MOVDDUP:
4541  case X86ISD::MOVLHPD:
4542  case X86ISD::MOVLPD:
4543  case X86ISD::MOVLPS:
4544  case X86ISD::MOVSHDUP:
4545  case X86ISD::MOVSLDUP:
4546  case X86ISD::PALIGN:
4547    // Not yet implemented
4548    return false;
4549  default: llvm_unreachable("unknown target shuffle node");
4550  }
4551
4552  return true;
4553}
4554
4555/// getShuffleScalarElt - Returns the scalar element that will make up the ith
4556/// element of the result of the vector shuffle.
4557static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
4558                                   unsigned Depth) {
4559  if (Depth == 6)
4560    return SDValue();  // Limit search depth.
4561
4562  SDValue V = SDValue(N, 0);
4563  EVT VT = V.getValueType();
4564  unsigned Opcode = V.getOpcode();
4565
4566  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
4567  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
4568    int Elt = SV->getMaskElt(Index);
4569
4570    if (Elt < 0)
4571      return DAG.getUNDEF(VT.getVectorElementType());
4572
4573    unsigned NumElems = VT.getVectorNumElements();
4574    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
4575                                         : SV->getOperand(1);
4576    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
4577  }
4578
4579  // Recurse into target specific vector shuffles to find scalars.
4580  if (isTargetShuffle(Opcode)) {
4581    MVT ShufVT = V.getValueType().getSimpleVT();
4582    unsigned NumElems = ShufVT.getVectorNumElements();
4583    SmallVector<int, 16> ShuffleMask;
4584    SDValue ImmN;
4585    bool IsUnary;
4586
4587    if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
4588      return SDValue();
4589
4590    int Elt = ShuffleMask[Index];
4591    if (Elt < 0)
4592      return DAG.getUNDEF(ShufVT.getVectorElementType());
4593
4594    SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
4595                                         : N->getOperand(1);
4596    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
4597                               Depth+1);
4598  }
4599
4600  // Actual nodes that may contain scalar elements
4601  if (Opcode == ISD::BITCAST) {
4602    V = V.getOperand(0);
4603    EVT SrcVT = V.getValueType();
4604    unsigned NumElems = VT.getVectorNumElements();
4605
4606    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
4607      return SDValue();
4608  }
4609
4610  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
4611    return (Index == 0) ? V.getOperand(0)
4612                        : DAG.getUNDEF(VT.getVectorElementType());
4613
4614  if (V.getOpcode() == ISD::BUILD_VECTOR)
4615    return V.getOperand(Index);
4616
4617  return SDValue();
4618}
4619
4620/// getNumOfConsecutiveZeros - Return the number of elements of a vector
4621/// shuffle operation which come from a consecutively from a zero. The
4622/// search can start in two different directions, from left or right.
4623static
4624unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems,
4625                                  bool ZerosFromLeft, SelectionDAG &DAG) {
4626  unsigned i;
4627  for (i = 0; i != NumElems; ++i) {
4628    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
4629    SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
4630    if (!(Elt.getNode() &&
4631         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
4632      break;
4633  }
4634
4635  return i;
4636}
4637
4638/// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
4639/// correspond consecutively to elements from one of the vector operands,
4640/// starting from its index OpIdx. Also tell OpNum which source vector operand.
4641static
4642bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
4643                              unsigned MaskI, unsigned MaskE, unsigned OpIdx,
4644                              unsigned NumElems, unsigned &OpNum) {
4645  bool SeenV1 = false;
4646  bool SeenV2 = false;
4647
4648  for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
4649    int Idx = SVOp->getMaskElt(i);
4650    // Ignore undef indicies
4651    if (Idx < 0)
4652      continue;
4653
4654    if (Idx < (int)NumElems)
4655      SeenV1 = true;
4656    else
4657      SeenV2 = true;
4658
4659    // Only accept consecutive elements from the same vector
4660    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
4661      return false;
4662  }
4663
4664  OpNum = SeenV1 ? 0 : 1;
4665  return true;
4666}
4667
4668/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
4669/// logical left shift of a vector.
4670static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4671                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4672  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
4673  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
4674              false /* check zeros from right */, DAG);
4675  unsigned OpSrc;
4676
4677  if (!NumZeros)
4678    return false;
4679
4680  // Considering the elements in the mask that are not consecutive zeros,
4681  // check if they consecutively come from only one of the source vectors.
4682  //
4683  //               V1 = {X, A, B, C}     0
4684  //                         \  \  \    /
4685  //   vector_shuffle V1, V2 <1, 2, 3, X>
4686  //
4687  if (!isShuffleMaskConsecutive(SVOp,
4688            0,                   // Mask Start Index
4689            NumElems-NumZeros,   // Mask End Index(exclusive)
4690            NumZeros,            // Where to start looking in the src vector
4691            NumElems,            // Number of elements in vector
4692            OpSrc))              // Which source operand ?
4693    return false;
4694
4695  isLeft = false;
4696  ShAmt = NumZeros;
4697  ShVal = SVOp->getOperand(OpSrc);
4698  return true;
4699}
4700
4701/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
4702/// logical left shift of a vector.
4703static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4704                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4705  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
4706  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
4707              true /* check zeros from left */, DAG);
4708  unsigned OpSrc;
4709
4710  if (!NumZeros)
4711    return false;
4712
4713  // Considering the elements in the mask that are not consecutive zeros,
4714  // check if they consecutively come from only one of the source vectors.
4715  //
4716  //                           0    { A, B, X, X } = V2
4717  //                          / \    /  /
4718  //   vector_shuffle V1, V2 <X, X, 4, 5>
4719  //
4720  if (!isShuffleMaskConsecutive(SVOp,
4721            NumZeros,     // Mask Start Index
4722            NumElems,     // Mask End Index(exclusive)
4723            0,            // Where to start looking in the src vector
4724            NumElems,     // Number of elements in vector
4725            OpSrc))       // Which source operand ?
4726    return false;
4727
4728  isLeft = true;
4729  ShAmt = NumZeros;
4730  ShVal = SVOp->getOperand(OpSrc);
4731  return true;
4732}
4733
4734/// isVectorShift - Returns true if the shuffle can be implemented as a
4735/// logical left or right shift of a vector.
4736static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
4737                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
4738  // Although the logic below support any bitwidth size, there are no
4739  // shift instructions which handle more than 128-bit vectors.
4740  if (!SVOp->getValueType(0).is128BitVector())
4741    return false;
4742
4743  if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
4744      isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
4745    return true;
4746
4747  return false;
4748}
4749
4750/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
4751///
4752static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
4753                                       unsigned NumNonZero, unsigned NumZero,
4754                                       SelectionDAG &DAG,
4755                                       const X86Subtarget* Subtarget,
4756                                       const TargetLowering &TLI) {
4757  if (NumNonZero > 8)
4758    return SDValue();
4759
4760  DebugLoc dl = Op.getDebugLoc();
4761  SDValue V(0, 0);
4762  bool First = true;
4763  for (unsigned i = 0; i < 16; ++i) {
4764    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
4765    if (ThisIsNonZero && First) {
4766      if (NumZero)
4767        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
4768      else
4769        V = DAG.getUNDEF(MVT::v8i16);
4770      First = false;
4771    }
4772
4773    if ((i & 1) != 0) {
4774      SDValue ThisElt(0, 0), LastElt(0, 0);
4775      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
4776      if (LastIsNonZero) {
4777        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
4778                              MVT::i16, Op.getOperand(i-1));
4779      }
4780      if (ThisIsNonZero) {
4781        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
4782        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
4783                              ThisElt, DAG.getConstant(8, MVT::i8));
4784        if (LastIsNonZero)
4785          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
4786      } else
4787        ThisElt = LastElt;
4788
4789      if (ThisElt.getNode())
4790        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
4791                        DAG.getIntPtrConstant(i/2));
4792    }
4793  }
4794
4795  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
4796}
4797
4798/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
4799///
4800static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
4801                                     unsigned NumNonZero, unsigned NumZero,
4802                                     SelectionDAG &DAG,
4803                                     const X86Subtarget* Subtarget,
4804                                     const TargetLowering &TLI) {
4805  if (NumNonZero > 4)
4806    return SDValue();
4807
4808  DebugLoc dl = Op.getDebugLoc();
4809  SDValue V(0, 0);
4810  bool First = true;
4811  for (unsigned i = 0; i < 8; ++i) {
4812    bool isNonZero = (NonZeros & (1 << i)) != 0;
4813    if (isNonZero) {
4814      if (First) {
4815        if (NumZero)
4816          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
4817        else
4818          V = DAG.getUNDEF(MVT::v8i16);
4819        First = false;
4820      }
4821      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
4822                      MVT::v8i16, V, Op.getOperand(i),
4823                      DAG.getIntPtrConstant(i));
4824    }
4825  }
4826
4827  return V;
4828}
4829
4830/// getVShift - Return a vector logical shift node.
4831///
4832static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
4833                         unsigned NumBits, SelectionDAG &DAG,
4834                         const TargetLowering &TLI, DebugLoc dl) {
4835  assert(VT.is128BitVector() && "Unknown type for VShift");
4836  EVT ShVT = MVT::v2i64;
4837  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
4838  SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
4839  return DAG.getNode(ISD::BITCAST, dl, VT,
4840                     DAG.getNode(Opc, dl, ShVT, SrcOp,
4841                             DAG.getConstant(NumBits,
4842                                  TLI.getShiftAmountTy(SrcOp.getValueType()))));
4843}
4844
4845SDValue
4846X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
4847                                          SelectionDAG &DAG) const {
4848
4849  // Check if the scalar load can be widened into a vector load. And if
4850  // the address is "base + cst" see if the cst can be "absorbed" into
4851  // the shuffle mask.
4852  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
4853    SDValue Ptr = LD->getBasePtr();
4854    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
4855      return SDValue();
4856    EVT PVT = LD->getValueType(0);
4857    if (PVT != MVT::i32 && PVT != MVT::f32)
4858      return SDValue();
4859
4860    int FI = -1;
4861    int64_t Offset = 0;
4862    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
4863      FI = FINode->getIndex();
4864      Offset = 0;
4865    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
4866               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
4867      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
4868      Offset = Ptr.getConstantOperandVal(1);
4869      Ptr = Ptr.getOperand(0);
4870    } else {
4871      return SDValue();
4872    }
4873
4874    // FIXME: 256-bit vector instructions don't require a strict alignment,
4875    // improve this code to support it better.
4876    unsigned RequiredAlign = VT.getSizeInBits()/8;
4877    SDValue Chain = LD->getChain();
4878    // Make sure the stack object alignment is at least 16 or 32.
4879    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
4880    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
4881      if (MFI->isFixedObjectIndex(FI)) {
4882        // Can't change the alignment. FIXME: It's possible to compute
4883        // the exact stack offset and reference FI + adjust offset instead.
4884        // If someone *really* cares about this. That's the way to implement it.
4885        return SDValue();
4886      } else {
4887        MFI->setObjectAlignment(FI, RequiredAlign);
4888      }
4889    }
4890
4891    // (Offset % 16 or 32) must be multiple of 4. Then address is then
4892    // Ptr + (Offset & ~15).
4893    if (Offset < 0)
4894      return SDValue();
4895    if ((Offset % RequiredAlign) & 3)
4896      return SDValue();
4897    int64_t StartOffset = Offset & ~(RequiredAlign-1);
4898    if (StartOffset)
4899      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
4900                        Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
4901
4902    int EltNo = (Offset - StartOffset) >> 2;
4903    unsigned NumElems = VT.getVectorNumElements();
4904
4905    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
4906    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
4907                             LD->getPointerInfo().getWithOffset(StartOffset),
4908                             false, false, false, 0);
4909
4910    SmallVector<int, 8> Mask;
4911    for (unsigned i = 0; i != NumElems; ++i)
4912      Mask.push_back(EltNo);
4913
4914    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
4915  }
4916
4917  return SDValue();
4918}
4919
4920/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
4921/// vector of type 'VT', see if the elements can be replaced by a single large
4922/// load which has the same value as a build_vector whose operands are 'elts'.
4923///
4924/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
4925///
4926/// FIXME: we'd also like to handle the case where the last elements are zero
4927/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
4928/// There's even a handy isZeroNode for that purpose.
4929static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
4930                                        DebugLoc &DL, SelectionDAG &DAG) {
4931  EVT EltVT = VT.getVectorElementType();
4932  unsigned NumElems = Elts.size();
4933
4934  LoadSDNode *LDBase = NULL;
4935  unsigned LastLoadedElt = -1U;
4936
4937  // For each element in the initializer, see if we've found a load or an undef.
4938  // If we don't find an initial load element, or later load elements are
4939  // non-consecutive, bail out.
4940  for (unsigned i = 0; i < NumElems; ++i) {
4941    SDValue Elt = Elts[i];
4942
4943    if (!Elt.getNode() ||
4944        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
4945      return SDValue();
4946    if (!LDBase) {
4947      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
4948        return SDValue();
4949      LDBase = cast<LoadSDNode>(Elt.getNode());
4950      LastLoadedElt = i;
4951      continue;
4952    }
4953    if (Elt.getOpcode() == ISD::UNDEF)
4954      continue;
4955
4956    LoadSDNode *LD = cast<LoadSDNode>(Elt);
4957    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
4958      return SDValue();
4959    LastLoadedElt = i;
4960  }
4961
4962  // If we have found an entire vector of loads and undefs, then return a large
4963  // load of the entire vector width starting at the base pointer.  If we found
4964  // consecutive loads for the low half, generate a vzext_load node.
4965  if (LastLoadedElt == NumElems - 1) {
4966    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
4967      return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
4968                         LDBase->getPointerInfo(),
4969                         LDBase->isVolatile(), LDBase->isNonTemporal(),
4970                         LDBase->isInvariant(), 0);
4971    return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
4972                       LDBase->getPointerInfo(),
4973                       LDBase->isVolatile(), LDBase->isNonTemporal(),
4974                       LDBase->isInvariant(), LDBase->getAlignment());
4975  }
4976  if (NumElems == 4 && LastLoadedElt == 1 &&
4977      DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
4978    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
4979    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
4980    SDValue ResNode =
4981        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64,
4982                                LDBase->getPointerInfo(),
4983                                LDBase->getAlignment(),
4984                                false/*isVolatile*/, true/*ReadMem*/,
4985                                false/*WriteMem*/);
4986
4987    // Make sure the newly-created LOAD is in the same position as LDBase in
4988    // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
4989    // update uses of LDBase's output chain to use the TokenFactor.
4990    if (LDBase->hasAnyUseOfValue(1)) {
4991      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
4992                             SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
4993      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
4994      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
4995                             SDValue(ResNode.getNode(), 1));
4996    }
4997
4998    return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
4999  }
5000  return SDValue();
5001}
5002
5003/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
5004/// to generate a splat value for the following cases:
5005/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
5006/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
5007/// a scalar load, or a constant.
5008/// The VBROADCAST node is returned when a pattern is found,
5009/// or SDValue() otherwise.
5010SDValue
5011X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
5012  if (!Subtarget->hasAVX())
5013    return SDValue();
5014
5015  EVT VT = Op.getValueType();
5016  DebugLoc dl = Op.getDebugLoc();
5017
5018  assert((VT.is128BitVector() || VT.is256BitVector()) &&
5019         "Unsupported vector type for broadcast.");
5020
5021  SDValue Ld;
5022  bool ConstSplatVal;
5023
5024  switch (Op.getOpcode()) {
5025    default:
5026      // Unknown pattern found.
5027      return SDValue();
5028
5029    case ISD::BUILD_VECTOR: {
5030      // The BUILD_VECTOR node must be a splat.
5031      if (!isSplatVector(Op.getNode()))
5032        return SDValue();
5033
5034      Ld = Op.getOperand(0);
5035      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5036                     Ld.getOpcode() == ISD::ConstantFP);
5037
5038      // The suspected load node has several users. Make sure that all
5039      // of its users are from the BUILD_VECTOR node.
5040      // Constants may have multiple users.
5041      if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
5042        return SDValue();
5043      break;
5044    }
5045
5046    case ISD::VECTOR_SHUFFLE: {
5047      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5048
5049      // Shuffles must have a splat mask where the first element is
5050      // broadcasted.
5051      if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
5052        return SDValue();
5053
5054      SDValue Sc = Op.getOperand(0);
5055      if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
5056          Sc.getOpcode() != ISD::BUILD_VECTOR) {
5057
5058        if (!Subtarget->hasAVX2())
5059          return SDValue();
5060
5061        // Use the register form of the broadcast instruction available on AVX2.
5062        if (VT.is256BitVector())
5063          Sc = Extract128BitVector(Sc, 0, DAG, dl);
5064        return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
5065      }
5066
5067      Ld = Sc.getOperand(0);
5068      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5069                       Ld.getOpcode() == ISD::ConstantFP);
5070
5071      // The scalar_to_vector node and the suspected
5072      // load node must have exactly one user.
5073      // Constants may have multiple users.
5074      if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
5075        return SDValue();
5076      break;
5077    }
5078  }
5079
5080  bool Is256 = VT.is256BitVector();
5081
5082  // Handle the broadcasting a single constant scalar from the constant pool
5083  // into a vector. On Sandybridge it is still better to load a constant vector
5084  // from the constant pool and not to broadcast it from a scalar.
5085  if (ConstSplatVal && Subtarget->hasAVX2()) {
5086    EVT CVT = Ld.getValueType();
5087    assert(!CVT.isVector() && "Must not broadcast a vector type");
5088    unsigned ScalarSize = CVT.getSizeInBits();
5089
5090    if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
5091      const Constant *C = 0;
5092      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
5093        C = CI->getConstantIntValue();
5094      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
5095        C = CF->getConstantFPValue();
5096
5097      assert(C && "Invalid constant type");
5098
5099      SDValue CP = DAG.getConstantPool(C, getPointerTy());
5100      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
5101      Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
5102                       MachinePointerInfo::getConstantPool(),
5103                       false, false, false, Alignment);
5104
5105      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5106    }
5107  }
5108
5109  bool IsLoad = ISD::isNormalLoad(Ld.getNode());
5110  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
5111
5112  // Handle AVX2 in-register broadcasts.
5113  if (!IsLoad && Subtarget->hasAVX2() &&
5114      (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
5115    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5116
5117  // The scalar source must be a normal load.
5118  if (!IsLoad)
5119    return SDValue();
5120
5121  if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
5122    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5123
5124  // The integer check is needed for the 64-bit into 128-bit so it doesn't match
5125  // double since there is no vbroadcastsd xmm
5126  if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) {
5127    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
5128      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5129  }
5130
5131  // Unsupported broadcast.
5132  return SDValue();
5133}
5134
5135// LowerVectorFpExtend - Recognize the scalarized FP_EXTEND from v2f32 to v2f64
5136// and convert it into X86ISD::VFPEXT due to the current ISD::FP_EXTEND has the
5137// constraint of matching input/output vector elements.
5138SDValue
5139X86TargetLowering::LowerVectorFpExtend(SDValue &Op, SelectionDAG &DAG) const {
5140  DebugLoc DL = Op.getDebugLoc();
5141  SDNode *N = Op.getNode();
5142  EVT VT = Op.getValueType();
5143  unsigned NumElts = Op.getNumOperands();
5144
5145  // Check supported types and sub-targets.
5146  //
5147  // Only v2f32 -> v2f64 needs special handling.
5148  if (VT != MVT::v2f64 || !Subtarget->hasSSE2())
5149    return SDValue();
5150
5151  SDValue VecIn;
5152  EVT VecInVT;
5153  SmallVector<int, 8> Mask;
5154  EVT SrcVT = MVT::Other;
5155
5156  // Check the patterns could be translated into X86vfpext.
5157  for (unsigned i = 0; i < NumElts; ++i) {
5158    SDValue In = N->getOperand(i);
5159    unsigned Opcode = In.getOpcode();
5160
5161    // Skip if the element is undefined.
5162    if (Opcode == ISD::UNDEF) {
5163      Mask.push_back(-1);
5164      continue;
5165    }
5166
5167    // Quit if one of the elements is not defined from 'fpext'.
5168    if (Opcode != ISD::FP_EXTEND)
5169      return SDValue();
5170
5171    // Check how the source of 'fpext' is defined.
5172    SDValue L2In = In.getOperand(0);
5173    EVT L2InVT = L2In.getValueType();
5174
5175    // Check the original type
5176    if (SrcVT == MVT::Other)
5177      SrcVT = L2InVT;
5178    else if (SrcVT != L2InVT) // Quit if non-homogenous typed.
5179      return SDValue();
5180
5181    // Check whether the value being 'fpext'ed is extracted from the same
5182    // source.
5183    Opcode = L2In.getOpcode();
5184
5185    // Quit if it's not extracted with a constant index.
5186    if (Opcode != ISD::EXTRACT_VECTOR_ELT ||
5187        !isa<ConstantSDNode>(L2In.getOperand(1)))
5188      return SDValue();
5189
5190    SDValue ExtractedFromVec = L2In.getOperand(0);
5191
5192    if (VecIn.getNode() == 0) {
5193      VecIn = ExtractedFromVec;
5194      VecInVT = ExtractedFromVec.getValueType();
5195    } else if (VecIn != ExtractedFromVec) // Quit if built from more than 1 vec.
5196      return SDValue();
5197
5198    Mask.push_back(cast<ConstantSDNode>(L2In.getOperand(1))->getZExtValue());
5199  }
5200
5201  // Quit if all operands of BUILD_VECTOR are undefined.
5202  if (!VecIn.getNode())
5203    return SDValue();
5204
5205  // Fill the remaining mask as undef.
5206  for (unsigned i = NumElts; i < VecInVT.getVectorNumElements(); ++i)
5207    Mask.push_back(-1);
5208
5209  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
5210                     DAG.getVectorShuffle(VecInVT, DL,
5211                                          VecIn, DAG.getUNDEF(VecInVT),
5212                                          &Mask[0]));
5213}
5214
5215SDValue
5216X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
5217  DebugLoc dl = Op.getDebugLoc();
5218
5219  EVT VT = Op.getValueType();
5220  EVT ExtVT = VT.getVectorElementType();
5221  unsigned NumElems = Op.getNumOperands();
5222
5223  // Vectors containing all zeros can be matched by pxor and xorps later
5224  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
5225    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
5226    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
5227    if (VT == MVT::v4i32 || VT == MVT::v8i32)
5228      return Op;
5229
5230    return getZeroVector(VT, Subtarget, DAG, dl);
5231  }
5232
5233  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
5234  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
5235  // vpcmpeqd on 256-bit vectors.
5236  if (ISD::isBuildVectorAllOnes(Op.getNode())) {
5237    if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasAVX2()))
5238      return Op;
5239
5240    return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl);
5241  }
5242
5243  SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
5244  if (Broadcast.getNode())
5245    return Broadcast;
5246
5247  SDValue FpExt = LowerVectorFpExtend(Op, DAG);
5248  if (FpExt.getNode())
5249    return FpExt;
5250
5251  unsigned EVTBits = ExtVT.getSizeInBits();
5252
5253  unsigned NumZero  = 0;
5254  unsigned NumNonZero = 0;
5255  unsigned NonZeros = 0;
5256  bool IsAllConstants = true;
5257  SmallSet<SDValue, 8> Values;
5258  for (unsigned i = 0; i < NumElems; ++i) {
5259    SDValue Elt = Op.getOperand(i);
5260    if (Elt.getOpcode() == ISD::UNDEF)
5261      continue;
5262    Values.insert(Elt);
5263    if (Elt.getOpcode() != ISD::Constant &&
5264        Elt.getOpcode() != ISD::ConstantFP)
5265      IsAllConstants = false;
5266    if (X86::isZeroNode(Elt))
5267      NumZero++;
5268    else {
5269      NonZeros |= (1 << i);
5270      NumNonZero++;
5271    }
5272  }
5273
5274  // All undef vector. Return an UNDEF.  All zero vectors were handled above.
5275  if (NumNonZero == 0)
5276    return DAG.getUNDEF(VT);
5277
5278  // Special case for single non-zero, non-undef, element.
5279  if (NumNonZero == 1) {
5280    unsigned Idx = CountTrailingZeros_32(NonZeros);
5281    SDValue Item = Op.getOperand(Idx);
5282
5283    // If this is an insertion of an i64 value on x86-32, and if the top bits of
5284    // the value are obviously zero, truncate the value to i32 and do the
5285    // insertion that way.  Only do this if the value is non-constant or if the
5286    // value is a constant being inserted into element 0.  It is cheaper to do
5287    // a constant pool load than it is to do a movd + shuffle.
5288    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
5289        (!IsAllConstants || Idx == 0)) {
5290      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
5291        // Handle SSE only.
5292        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
5293        EVT VecVT = MVT::v4i32;
5294        unsigned VecElts = 4;
5295
5296        // Truncate the value (which may itself be a constant) to i32, and
5297        // convert it to a vector with movd (S2V+shuffle to zero extend).
5298        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
5299        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
5300        Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5301
5302        // Now we have our 32-bit value zero extended in the low element of
5303        // a vector.  If Idx != 0, swizzle it into place.
5304        if (Idx != 0) {
5305          SmallVector<int, 4> Mask;
5306          Mask.push_back(Idx);
5307          for (unsigned i = 1; i != VecElts; ++i)
5308            Mask.push_back(i);
5309          Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
5310                                      &Mask[0]);
5311        }
5312        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
5313      }
5314    }
5315
5316    // If we have a constant or non-constant insertion into the low element of
5317    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
5318    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
5319    // depending on what the source datatype is.
5320    if (Idx == 0) {
5321      if (NumZero == 0)
5322        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5323
5324      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
5325          (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
5326        if (VT.is256BitVector()) {
5327          SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
5328          return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
5329                             Item, DAG.getIntPtrConstant(0));
5330        }
5331        assert(VT.is128BitVector() && "Expected an SSE value type!");
5332        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5333        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
5334        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5335      }
5336
5337      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
5338        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
5339        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
5340        if (VT.is256BitVector()) {
5341          SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
5342          Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
5343        } else {
5344          assert(VT.is128BitVector() && "Expected an SSE value type!");
5345          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
5346        }
5347        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
5348      }
5349    }
5350
5351    // Is it a vector logical left shift?
5352    if (NumElems == 2 && Idx == 1 &&
5353        X86::isZeroNode(Op.getOperand(0)) &&
5354        !X86::isZeroNode(Op.getOperand(1))) {
5355      unsigned NumBits = VT.getSizeInBits();
5356      return getVShift(true, VT,
5357                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5358                                   VT, Op.getOperand(1)),
5359                       NumBits/2, DAG, *this, dl);
5360    }
5361
5362    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
5363      return SDValue();
5364
5365    // Otherwise, if this is a vector with i32 or f32 elements, and the element
5366    // is a non-constant being inserted into an element other than the low one,
5367    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
5368    // movd/movss) to move this into the low element, then shuffle it into
5369    // place.
5370    if (EVTBits == 32) {
5371      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
5372
5373      // Turn it into a shuffle of zero and zero-extended scalar to vector.
5374      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
5375      SmallVector<int, 8> MaskVec;
5376      for (unsigned i = 0; i != NumElems; ++i)
5377        MaskVec.push_back(i == Idx ? 0 : 1);
5378      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
5379    }
5380  }
5381
5382  // Splat is obviously ok. Let legalizer expand it to a shuffle.
5383  if (Values.size() == 1) {
5384    if (EVTBits == 32) {
5385      // Instead of a shuffle like this:
5386      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
5387      // Check if it's possible to issue this instead.
5388      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
5389      unsigned Idx = CountTrailingZeros_32(NonZeros);
5390      SDValue Item = Op.getOperand(Idx);
5391      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
5392        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
5393    }
5394    return SDValue();
5395  }
5396
5397  // A vector full of immediates; various special cases are already
5398  // handled, so this is best done with a single constant-pool load.
5399  if (IsAllConstants)
5400    return SDValue();
5401
5402  // For AVX-length vectors, build the individual 128-bit pieces and use
5403  // shuffles to put them in place.
5404  if (VT.is256BitVector()) {
5405    SmallVector<SDValue, 32> V;
5406    for (unsigned i = 0; i != NumElems; ++i)
5407      V.push_back(Op.getOperand(i));
5408
5409    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
5410
5411    // Build both the lower and upper subvector.
5412    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
5413    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
5414                                NumElems/2);
5415
5416    // Recreate the wider vector with the lower and upper part.
5417    return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
5418  }
5419
5420  // Let legalizer expand 2-wide build_vectors.
5421  if (EVTBits == 64) {
5422    if (NumNonZero == 1) {
5423      // One half is zero or undef.
5424      unsigned Idx = CountTrailingZeros_32(NonZeros);
5425      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
5426                                 Op.getOperand(Idx));
5427      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
5428    }
5429    return SDValue();
5430  }
5431
5432  // If element VT is < 32 bits, convert it to inserts into a zero vector.
5433  if (EVTBits == 8 && NumElems == 16) {
5434    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
5435                                        Subtarget, *this);
5436    if (V.getNode()) return V;
5437  }
5438
5439  if (EVTBits == 16 && NumElems == 8) {
5440    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
5441                                      Subtarget, *this);
5442    if (V.getNode()) return V;
5443  }
5444
5445  // If element VT is == 32 bits, turn it into a number of shuffles.
5446  SmallVector<SDValue, 8> V(NumElems);
5447  if (NumElems == 4 && NumZero > 0) {
5448    for (unsigned i = 0; i < 4; ++i) {
5449      bool isZero = !(NonZeros & (1 << i));
5450      if (isZero)
5451        V[i] = getZeroVector(VT, Subtarget, DAG, dl);
5452      else
5453        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
5454    }
5455
5456    for (unsigned i = 0; i < 2; ++i) {
5457      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
5458        default: break;
5459        case 0:
5460          V[i] = V[i*2];  // Must be a zero vector.
5461          break;
5462        case 1:
5463          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
5464          break;
5465        case 2:
5466          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
5467          break;
5468        case 3:
5469          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
5470          break;
5471      }
5472    }
5473
5474    bool Reverse1 = (NonZeros & 0x3) == 2;
5475    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
5476    int MaskVec[] = {
5477      Reverse1 ? 1 : 0,
5478      Reverse1 ? 0 : 1,
5479      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
5480      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
5481    };
5482    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
5483  }
5484
5485  if (Values.size() > 1 && VT.is128BitVector()) {
5486    // Check for a build vector of consecutive loads.
5487    for (unsigned i = 0; i < NumElems; ++i)
5488      V[i] = Op.getOperand(i);
5489
5490    // Check for elements which are consecutive loads.
5491    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
5492    if (LD.getNode())
5493      return LD;
5494
5495    // For SSE 4.1, use insertps to put the high elements into the low element.
5496    if (getSubtarget()->hasSSE41()) {
5497      SDValue Result;
5498      if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
5499        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
5500      else
5501        Result = DAG.getUNDEF(VT);
5502
5503      for (unsigned i = 1; i < NumElems; ++i) {
5504        if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
5505        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
5506                             Op.getOperand(i), DAG.getIntPtrConstant(i));
5507      }
5508      return Result;
5509    }
5510
5511    // Otherwise, expand into a number of unpckl*, start by extending each of
5512    // our (non-undef) elements to the full vector width with the element in the
5513    // bottom slot of the vector (which generates no code for SSE).
5514    for (unsigned i = 0; i < NumElems; ++i) {
5515      if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
5516        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
5517      else
5518        V[i] = DAG.getUNDEF(VT);
5519    }
5520
5521    // Next, we iteratively mix elements, e.g. for v4f32:
5522    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
5523    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
5524    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
5525    unsigned EltStride = NumElems >> 1;
5526    while (EltStride != 0) {
5527      for (unsigned i = 0; i < EltStride; ++i) {
5528        // If V[i+EltStride] is undef and this is the first round of mixing,
5529        // then it is safe to just drop this shuffle: V[i] is already in the
5530        // right place, the one element (since it's the first round) being
5531        // inserted as undef can be dropped.  This isn't safe for successive
5532        // rounds because they will permute elements within both vectors.
5533        if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
5534            EltStride == NumElems/2)
5535          continue;
5536
5537        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
5538      }
5539      EltStride >>= 1;
5540    }
5541    return V[0];
5542  }
5543  return SDValue();
5544}
5545
5546// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
5547// to create 256-bit vectors from two other 128-bit ones.
5548static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
5549  DebugLoc dl = Op.getDebugLoc();
5550  EVT ResVT = Op.getValueType();
5551
5552  assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");
5553
5554  SDValue V1 = Op.getOperand(0);
5555  SDValue V2 = Op.getOperand(1);
5556  unsigned NumElems = ResVT.getVectorNumElements();
5557
5558  return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
5559}
5560
5561SDValue
5562X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
5563  assert(Op.getNumOperands() == 2);
5564
5565  // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
5566  // from two other 128-bit ones.
5567  return LowerAVXCONCAT_VECTORS(Op, DAG);
5568}
5569
5570// Try to lower a shuffle node into a simple blend instruction.
5571static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
5572                                          const X86Subtarget *Subtarget,
5573                                          SelectionDAG &DAG) {
5574  SDValue V1 = SVOp->getOperand(0);
5575  SDValue V2 = SVOp->getOperand(1);
5576  DebugLoc dl = SVOp->getDebugLoc();
5577  MVT VT = SVOp->getValueType(0).getSimpleVT();
5578  unsigned NumElems = VT.getVectorNumElements();
5579
5580  if (!Subtarget->hasSSE41())
5581    return SDValue();
5582
5583  unsigned ISDNo = 0;
5584  MVT OpTy;
5585
5586  switch (VT.SimpleTy) {
5587  default: return SDValue();
5588  case MVT::v8i16:
5589    ISDNo = X86ISD::BLENDPW;
5590    OpTy = MVT::v8i16;
5591    break;
5592  case MVT::v4i32:
5593  case MVT::v4f32:
5594    ISDNo = X86ISD::BLENDPS;
5595    OpTy = MVT::v4f32;
5596    break;
5597  case MVT::v2i64:
5598  case MVT::v2f64:
5599    ISDNo = X86ISD::BLENDPD;
5600    OpTy = MVT::v2f64;
5601    break;
5602  case MVT::v8i32:
5603  case MVT::v8f32:
5604    if (!Subtarget->hasAVX())
5605      return SDValue();
5606    ISDNo = X86ISD::BLENDPS;
5607    OpTy = MVT::v8f32;
5608    break;
5609  case MVT::v4i64:
5610  case MVT::v4f64:
5611    if (!Subtarget->hasAVX())
5612      return SDValue();
5613    ISDNo = X86ISD::BLENDPD;
5614    OpTy = MVT::v4f64;
5615    break;
5616  }
5617  assert(ISDNo && "Invalid Op Number");
5618
5619  unsigned MaskVals = 0;
5620
5621  for (unsigned i = 0; i != NumElems; ++i) {
5622    int EltIdx = SVOp->getMaskElt(i);
5623    if (EltIdx == (int)i || EltIdx < 0)
5624      MaskVals |= (1<<i);
5625    else if (EltIdx == (int)(i + NumElems))
5626      continue; // Bit is set to zero;
5627    else
5628      return SDValue();
5629  }
5630
5631  V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1);
5632  V2 = DAG.getNode(ISD::BITCAST, dl, OpTy, V2);
5633  SDValue Ret =  DAG.getNode(ISDNo, dl, OpTy, V1, V2,
5634                             DAG.getConstant(MaskVals, MVT::i32));
5635  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
5636}
5637
5638// v8i16 shuffles - Prefer shuffles in the following order:
5639// 1. [all]   pshuflw, pshufhw, optional move
5640// 2. [ssse3] 1 x pshufb
5641// 3. [ssse3] 2 x pshufb + 1 x por
5642// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
5643SDValue
5644X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
5645                                            SelectionDAG &DAG) const {
5646  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5647  SDValue V1 = SVOp->getOperand(0);
5648  SDValue V2 = SVOp->getOperand(1);
5649  DebugLoc dl = SVOp->getDebugLoc();
5650  SmallVector<int, 8> MaskVals;
5651
5652  // Determine if more than 1 of the words in each of the low and high quadwords
5653  // of the result come from the same quadword of one of the two inputs.  Undef
5654  // mask values count as coming from any quadword, for better codegen.
5655  unsigned LoQuad[] = { 0, 0, 0, 0 };
5656  unsigned HiQuad[] = { 0, 0, 0, 0 };
5657  std::bitset<4> InputQuads;
5658  for (unsigned i = 0; i < 8; ++i) {
5659    unsigned *Quad = i < 4 ? LoQuad : HiQuad;
5660    int EltIdx = SVOp->getMaskElt(i);
5661    MaskVals.push_back(EltIdx);
5662    if (EltIdx < 0) {
5663      ++Quad[0];
5664      ++Quad[1];
5665      ++Quad[2];
5666      ++Quad[3];
5667      continue;
5668    }
5669    ++Quad[EltIdx / 4];
5670    InputQuads.set(EltIdx / 4);
5671  }
5672
5673  int BestLoQuad = -1;
5674  unsigned MaxQuad = 1;
5675  for (unsigned i = 0; i < 4; ++i) {
5676    if (LoQuad[i] > MaxQuad) {
5677      BestLoQuad = i;
5678      MaxQuad = LoQuad[i];
5679    }
5680  }
5681
5682  int BestHiQuad = -1;
5683  MaxQuad = 1;
5684  for (unsigned i = 0; i < 4; ++i) {
5685    if (HiQuad[i] > MaxQuad) {
5686      BestHiQuad = i;
5687      MaxQuad = HiQuad[i];
5688    }
5689  }
5690
5691  // For SSSE3, If all 8 words of the result come from only 1 quadword of each
5692  // of the two input vectors, shuffle them into one input vector so only a
5693  // single pshufb instruction is necessary. If There are more than 2 input
5694  // quads, disable the next transformation since it does not help SSSE3.
5695  bool V1Used = InputQuads[0] || InputQuads[1];
5696  bool V2Used = InputQuads[2] || InputQuads[3];
5697  if (Subtarget->hasSSSE3()) {
5698    if (InputQuads.count() == 2 && V1Used && V2Used) {
5699      BestLoQuad = InputQuads[0] ? 0 : 1;
5700      BestHiQuad = InputQuads[2] ? 2 : 3;
5701    }
5702    if (InputQuads.count() > 2) {
5703      BestLoQuad = -1;
5704      BestHiQuad = -1;
5705    }
5706  }
5707
5708  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
5709  // the shuffle mask.  If a quad is scored as -1, that means that it contains
5710  // words from all 4 input quadwords.
5711  SDValue NewV;
5712  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
5713    int MaskV[] = {
5714      BestLoQuad < 0 ? 0 : BestLoQuad,
5715      BestHiQuad < 0 ? 1 : BestHiQuad
5716    };
5717    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
5718                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
5719                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
5720    NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
5721
5722    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
5723    // source words for the shuffle, to aid later transformations.
5724    bool AllWordsInNewV = true;
5725    bool InOrder[2] = { true, true };
5726    for (unsigned i = 0; i != 8; ++i) {
5727      int idx = MaskVals[i];
5728      if (idx != (int)i)
5729        InOrder[i/4] = false;
5730      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
5731        continue;
5732      AllWordsInNewV = false;
5733      break;
5734    }
5735
5736    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
5737    if (AllWordsInNewV) {
5738      for (int i = 0; i != 8; ++i) {
5739        int idx = MaskVals[i];
5740        if (idx < 0)
5741          continue;
5742        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
5743        if ((idx != i) && idx < 4)
5744          pshufhw = false;
5745        if ((idx != i) && idx > 3)
5746          pshuflw = false;
5747      }
5748      V1 = NewV;
5749      V2Used = false;
5750      BestLoQuad = 0;
5751      BestHiQuad = 1;
5752    }
5753
5754    // If we've eliminated the use of V2, and the new mask is a pshuflw or
5755    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
5756    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
5757      unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
5758      unsigned TargetMask = 0;
5759      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
5760                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
5761      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
5762      TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
5763                             getShufflePSHUFLWImmediate(SVOp);
5764      V1 = NewV.getOperand(0);
5765      return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
5766    }
5767  }
5768
5769  // If we have SSSE3, and all words of the result are from 1 input vector,
5770  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
5771  // is present, fall back to case 4.
5772  if (Subtarget->hasSSSE3()) {
5773    SmallVector<SDValue,16> pshufbMask;
5774
5775    // If we have elements from both input vectors, set the high bit of the
5776    // shuffle mask element to zero out elements that come from V2 in the V1
5777    // mask, and elements that come from V1 in the V2 mask, so that the two
5778    // results can be OR'd together.
5779    bool TwoInputs = V1Used && V2Used;
5780    for (unsigned i = 0; i != 8; ++i) {
5781      int EltIdx = MaskVals[i] * 2;
5782      int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
5783      int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
5784      pshufbMask.push_back(DAG.getConstant(Idx0,   MVT::i8));
5785      pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
5786    }
5787    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
5788    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
5789                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5790                                 MVT::v16i8, &pshufbMask[0], 16));
5791    if (!TwoInputs)
5792      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
5793
5794    // Calculate the shuffle mask for the second input, shuffle it, and
5795    // OR it with the first shuffled input.
5796    pshufbMask.clear();
5797    for (unsigned i = 0; i != 8; ++i) {
5798      int EltIdx = MaskVals[i] * 2;
5799      int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16;
5800      int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15;
5801      pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
5802      pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
5803    }
5804    V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
5805    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
5806                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5807                                 MVT::v16i8, &pshufbMask[0], 16));
5808    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
5809    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
5810  }
5811
5812  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
5813  // and update MaskVals with new element order.
5814  std::bitset<8> InOrder;
5815  if (BestLoQuad >= 0) {
5816    int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
5817    for (int i = 0; i != 4; ++i) {
5818      int idx = MaskVals[i];
5819      if (idx < 0) {
5820        InOrder.set(i);
5821      } else if ((idx / 4) == BestLoQuad) {
5822        MaskV[i] = idx & 3;
5823        InOrder.set(i);
5824      }
5825    }
5826    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
5827                                &MaskV[0]);
5828
5829    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
5830      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
5831      NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
5832                                  NewV.getOperand(0),
5833                                  getShufflePSHUFLWImmediate(SVOp), DAG);
5834    }
5835  }
5836
5837  // If BestHi >= 0, generate a pshufhw to put the high elements in order,
5838  // and update MaskVals with the new element order.
5839  if (BestHiQuad >= 0) {
5840    int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
5841    for (unsigned i = 4; i != 8; ++i) {
5842      int idx = MaskVals[i];
5843      if (idx < 0) {
5844        InOrder.set(i);
5845      } else if ((idx / 4) == BestHiQuad) {
5846        MaskV[i] = (idx & 3) + 4;
5847        InOrder.set(i);
5848      }
5849    }
5850    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
5851                                &MaskV[0]);
5852
5853    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
5854      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
5855      NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
5856                                  NewV.getOperand(0),
5857                                  getShufflePSHUFHWImmediate(SVOp), DAG);
5858    }
5859  }
5860
5861  // In case BestHi & BestLo were both -1, which means each quadword has a word
5862  // from each of the four input quadwords, calculate the InOrder bitvector now
5863  // before falling through to the insert/extract cleanup.
5864  if (BestLoQuad == -1 && BestHiQuad == -1) {
5865    NewV = V1;
5866    for (int i = 0; i != 8; ++i)
5867      if (MaskVals[i] < 0 || MaskVals[i] == i)
5868        InOrder.set(i);
5869  }
5870
5871  // The other elements are put in the right place using pextrw and pinsrw.
5872  for (unsigned i = 0; i != 8; ++i) {
5873    if (InOrder[i])
5874      continue;
5875    int EltIdx = MaskVals[i];
5876    if (EltIdx < 0)
5877      continue;
5878    SDValue ExtOp = (EltIdx < 8) ?
5879      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
5880                  DAG.getIntPtrConstant(EltIdx)) :
5881      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
5882                  DAG.getIntPtrConstant(EltIdx - 8));
5883    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
5884                       DAG.getIntPtrConstant(i));
5885  }
5886  return NewV;
5887}
5888
5889// v16i8 shuffles - Prefer shuffles in the following order:
5890// 1. [ssse3] 1 x pshufb
5891// 2. [ssse3] 2 x pshufb + 1 x por
5892// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
5893static
5894SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
5895                                 SelectionDAG &DAG,
5896                                 const X86TargetLowering &TLI) {
5897  SDValue V1 = SVOp->getOperand(0);
5898  SDValue V2 = SVOp->getOperand(1);
5899  DebugLoc dl = SVOp->getDebugLoc();
5900  ArrayRef<int> MaskVals = SVOp->getMask();
5901
5902  // If we have SSSE3, case 1 is generated when all result bytes come from
5903  // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
5904  // present, fall back to case 3.
5905
5906  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
5907  if (TLI.getSubtarget()->hasSSSE3()) {
5908    SmallVector<SDValue,16> pshufbMask;
5909
5910    // If all result elements are from one input vector, then only translate
5911    // undef mask values to 0x80 (zero out result) in the pshufb mask.
5912    //
5913    // Otherwise, we have elements from both input vectors, and must zero out
5914    // elements that come from V2 in the first mask, and V1 in the second mask
5915    // so that we can OR them together.
5916    for (unsigned i = 0; i != 16; ++i) {
5917      int EltIdx = MaskVals[i];
5918      if (EltIdx < 0 || EltIdx >= 16)
5919        EltIdx = 0x80;
5920      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
5921    }
5922    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
5923                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5924                                 MVT::v16i8, &pshufbMask[0], 16));
5925
5926    // As PSHUFB will zero elements with negative indices, it's safe to ignore
5927    // the 2nd operand if it's undefined or zero.
5928    if (V2.getOpcode() == ISD::UNDEF ||
5929        ISD::isBuildVectorAllZeros(V2.getNode()))
5930      return V1;
5931
5932    // Calculate the shuffle mask for the second input, shuffle it, and
5933    // OR it with the first shuffled input.
5934    pshufbMask.clear();
5935    for (unsigned i = 0; i != 16; ++i) {
5936      int EltIdx = MaskVals[i];
5937      EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
5938      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
5939    }
5940    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
5941                     DAG.getNode(ISD::BUILD_VECTOR, dl,
5942                                 MVT::v16i8, &pshufbMask[0], 16));
5943    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
5944  }
5945
5946  // No SSSE3 - Calculate in place words and then fix all out of place words
5947  // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
5948  // the 16 different words that comprise the two doublequadword input vectors.
5949  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
5950  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
5951  SDValue NewV = V1;
5952  for (int i = 0; i != 8; ++i) {
5953    int Elt0 = MaskVals[i*2];
5954    int Elt1 = MaskVals[i*2+1];
5955
5956    // This word of the result is all undef, skip it.
5957    if (Elt0 < 0 && Elt1 < 0)
5958      continue;
5959
5960    // This word of the result is already in the correct place, skip it.
5961    if ((Elt0 == i*2) && (Elt1 == i*2+1))
5962      continue;
5963
5964    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
5965    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
5966    SDValue InsElt;
5967
5968    // If Elt0 and Elt1 are defined, are consecutive, and can be load
5969    // using a single extract together, load it and store it.
5970    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
5971      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
5972                           DAG.getIntPtrConstant(Elt1 / 2));
5973      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
5974                        DAG.getIntPtrConstant(i));
5975      continue;
5976    }
5977
5978    // If Elt1 is defined, extract it from the appropriate source.  If the
5979    // source byte is not also odd, shift the extracted word left 8 bits
5980    // otherwise clear the bottom 8 bits if we need to do an or.
5981    if (Elt1 >= 0) {
5982      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
5983                           DAG.getIntPtrConstant(Elt1 / 2));
5984      if ((Elt1 & 1) == 0)
5985        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
5986                             DAG.getConstant(8,
5987                                  TLI.getShiftAmountTy(InsElt.getValueType())));
5988      else if (Elt0 >= 0)
5989        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
5990                             DAG.getConstant(0xFF00, MVT::i16));
5991    }
5992    // If Elt0 is defined, extract it from the appropriate source.  If the
5993    // source byte is not also even, shift the extracted word right 8 bits. If
5994    // Elt1 was also defined, OR the extracted values together before
5995    // inserting them in the result.
5996    if (Elt0 >= 0) {
5997      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
5998                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
5999      if ((Elt0 & 1) != 0)
6000        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
6001                              DAG.getConstant(8,
6002                                 TLI.getShiftAmountTy(InsElt0.getValueType())));
6003      else if (Elt1 >= 0)
6004        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
6005                             DAG.getConstant(0x00FF, MVT::i16));
6006      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
6007                         : InsElt0;
6008    }
6009    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
6010                       DAG.getIntPtrConstant(i));
6011  }
6012  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
6013}
6014
6015// v32i8 shuffles - Translate to VPSHUFB if possible.
6016static
6017SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
6018                                 SelectionDAG &DAG,
6019                                 const X86TargetLowering &TLI) {
6020  EVT VT = SVOp->getValueType(0);
6021  SDValue V1 = SVOp->getOperand(0);
6022  SDValue V2 = SVOp->getOperand(1);
6023  DebugLoc dl = SVOp->getDebugLoc();
6024  ArrayRef<int> MaskVals = SVOp->getMask();
6025
6026  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
6027
6028  if (VT != MVT::v32i8 || !TLI.getSubtarget()->hasAVX2() || !V2IsUndef)
6029    return SDValue();
6030
6031  SmallVector<SDValue,32> pshufbMask;
6032  for (unsigned i = 0; i != 32; i++) {
6033    int EltIdx = MaskVals[i];
6034    if (EltIdx < 0 || EltIdx >= 32)
6035      EltIdx = 0x80;
6036    else {
6037      if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
6038        // Cross lane is not allowed.
6039        return SDValue();
6040      EltIdx &= 0xf;
6041    }
6042    pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
6043  }
6044  return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
6045                      DAG.getNode(ISD::BUILD_VECTOR, dl,
6046                                  MVT::v32i8, &pshufbMask[0], 32));
6047}
6048
6049/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
6050/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
6051/// done when every pair / quad of shuffle mask elements point to elements in
6052/// the right sequence. e.g.
6053/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
6054static
6055SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
6056                                 SelectionDAG &DAG, DebugLoc dl) {
6057  MVT VT = SVOp->getValueType(0).getSimpleVT();
6058  unsigned NumElems = VT.getVectorNumElements();
6059  MVT NewVT;
6060  unsigned Scale;
6061  switch (VT.SimpleTy) {
6062  default: llvm_unreachable("Unexpected!");
6063  case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
6064  case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
6065  case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
6066  case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
6067  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
6068  case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
6069  }
6070
6071  SmallVector<int, 8> MaskVec;
6072  for (unsigned i = 0; i != NumElems; i += Scale) {
6073    int StartIdx = -1;
6074    for (unsigned j = 0; j != Scale; ++j) {
6075      int EltIdx = SVOp->getMaskElt(i+j);
6076      if (EltIdx < 0)
6077        continue;
6078      if (StartIdx < 0)
6079        StartIdx = (EltIdx / Scale);
6080      if (EltIdx != (int)(StartIdx*Scale + j))
6081        return SDValue();
6082    }
6083    MaskVec.push_back(StartIdx);
6084  }
6085
6086  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
6087  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
6088  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
6089}
6090
6091/// getVZextMovL - Return a zero-extending vector move low node.
6092///
6093static SDValue getVZextMovL(EVT VT, EVT OpVT,
6094                            SDValue SrcOp, SelectionDAG &DAG,
6095                            const X86Subtarget *Subtarget, DebugLoc dl) {
6096  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
6097    LoadSDNode *LD = NULL;
6098    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
6099      LD = dyn_cast<LoadSDNode>(SrcOp);
6100    if (!LD) {
6101      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
6102      // instead.
6103      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
6104      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
6105          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6106          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
6107          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
6108        // PR2108
6109        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
6110        return DAG.getNode(ISD::BITCAST, dl, VT,
6111                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
6112                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
6113                                                   OpVT,
6114                                                   SrcOp.getOperand(0)
6115                                                          .getOperand(0))));
6116      }
6117    }
6118  }
6119
6120  return DAG.getNode(ISD::BITCAST, dl, VT,
6121                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
6122                                 DAG.getNode(ISD::BITCAST, dl,
6123                                             OpVT, SrcOp)));
6124}
6125
6126/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
6127/// which could not be matched by any known target speficic shuffle
6128static SDValue
6129LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
6130
6131  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
6132  if (NewOp.getNode())
6133    return NewOp;
6134
6135  EVT VT = SVOp->getValueType(0);
6136
6137  unsigned NumElems = VT.getVectorNumElements();
6138  unsigned NumLaneElems = NumElems / 2;
6139
6140  DebugLoc dl = SVOp->getDebugLoc();
6141  MVT EltVT = VT.getVectorElementType().getSimpleVT();
6142  EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
6143  SDValue Output[2];
6144
6145  SmallVector<int, 16> Mask;
6146  for (unsigned l = 0; l < 2; ++l) {
6147    // Build a shuffle mask for the output, discovering on the fly which
6148    // input vectors to use as shuffle operands (recorded in InputUsed).
6149    // If building a suitable shuffle vector proves too hard, then bail
6150    // out with UseBuildVector set.
6151    bool UseBuildVector = false;
6152    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
6153    unsigned LaneStart = l * NumLaneElems;
6154    for (unsigned i = 0; i != NumLaneElems; ++i) {
6155      // The mask element.  This indexes into the input.
6156      int Idx = SVOp->getMaskElt(i+LaneStart);
6157      if (Idx < 0) {
6158        // the mask element does not index into any input vector.
6159        Mask.push_back(-1);
6160        continue;
6161      }
6162
6163      // The input vector this mask element indexes into.
6164      int Input = Idx / NumLaneElems;
6165
6166      // Turn the index into an offset from the start of the input vector.
6167      Idx -= Input * NumLaneElems;
6168
6169      // Find or create a shuffle vector operand to hold this input.
6170      unsigned OpNo;
6171      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
6172        if (InputUsed[OpNo] == Input)
6173          // This input vector is already an operand.
6174          break;
6175        if (InputUsed[OpNo] < 0) {
6176          // Create a new operand for this input vector.
6177          InputUsed[OpNo] = Input;
6178          break;
6179        }
6180      }
6181
6182      if (OpNo >= array_lengthof(InputUsed)) {
6183        // More than two input vectors used!  Give up on trying to create a
6184        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
6185        UseBuildVector = true;
6186        break;
6187      }
6188
6189      // Add the mask index for the new shuffle vector.
6190      Mask.push_back(Idx + OpNo * NumLaneElems);
6191    }
6192
6193    if (UseBuildVector) {
6194      SmallVector<SDValue, 16> SVOps;
6195      for (unsigned i = 0; i != NumLaneElems; ++i) {
6196        // The mask element.  This indexes into the input.
6197        int Idx = SVOp->getMaskElt(i+LaneStart);
6198        if (Idx < 0) {
6199          SVOps.push_back(DAG.getUNDEF(EltVT));
6200          continue;
6201        }
6202
6203        // The input vector this mask element indexes into.
6204        int Input = Idx / NumElems;
6205
6206        // Turn the index into an offset from the start of the input vector.
6207        Idx -= Input * NumElems;
6208
6209        // Extract the vector element by hand.
6210        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
6211                                    SVOp->getOperand(Input),
6212                                    DAG.getIntPtrConstant(Idx)));
6213      }
6214
6215      // Construct the output using a BUILD_VECTOR.
6216      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
6217                              SVOps.size());
6218    } else if (InputUsed[0] < 0) {
6219      // No input vectors were used! The result is undefined.
6220      Output[l] = DAG.getUNDEF(NVT);
6221    } else {
6222      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
6223                                        (InputUsed[0] % 2) * NumLaneElems,
6224                                        DAG, dl);
6225      // If only one input was used, use an undefined vector for the other.
6226      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
6227        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
6228                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
6229      // At least one input vector was used. Create a new shuffle vector.
6230      Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
6231    }
6232
6233    Mask.clear();
6234  }
6235
6236  // Concatenate the result back
6237  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
6238}
6239
6240/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
6241/// 4 elements, and match them with several different shuffle types.
6242static SDValue
6243LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
6244  SDValue V1 = SVOp->getOperand(0);
6245  SDValue V2 = SVOp->getOperand(1);
6246  DebugLoc dl = SVOp->getDebugLoc();
6247  EVT VT = SVOp->getValueType(0);
6248
6249  assert(VT.is128BitVector() && "Unsupported vector size");
6250
6251  std::pair<int, int> Locs[4];
6252  int Mask1[] = { -1, -1, -1, -1 };
6253  SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
6254
6255  unsigned NumHi = 0;
6256  unsigned NumLo = 0;
6257  for (unsigned i = 0; i != 4; ++i) {
6258    int Idx = PermMask[i];
6259    if (Idx < 0) {
6260      Locs[i] = std::make_pair(-1, -1);
6261    } else {
6262      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
6263      if (Idx < 4) {
6264        Locs[i] = std::make_pair(0, NumLo);
6265        Mask1[NumLo] = Idx;
6266        NumLo++;
6267      } else {
6268        Locs[i] = std::make_pair(1, NumHi);
6269        if (2+NumHi < 4)
6270          Mask1[2+NumHi] = Idx;
6271        NumHi++;
6272      }
6273    }
6274  }
6275
6276  if (NumLo <= 2 && NumHi <= 2) {
6277    // If no more than two elements come from either vector. This can be
6278    // implemented with two shuffles. First shuffle gather the elements.
6279    // The second shuffle, which takes the first shuffle as both of its
6280    // vector operands, put the elements into the right order.
6281    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6282
6283    int Mask2[] = { -1, -1, -1, -1 };
6284
6285    for (unsigned i = 0; i != 4; ++i)
6286      if (Locs[i].first != -1) {
6287        unsigned Idx = (i < 2) ? 0 : 4;
6288        Idx += Locs[i].first * 2 + Locs[i].second;
6289        Mask2[i] = Idx;
6290      }
6291
6292    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
6293  }
6294
6295  if (NumLo == 3 || NumHi == 3) {
6296    // Otherwise, we must have three elements from one vector, call it X, and
6297    // one element from the other, call it Y.  First, use a shufps to build an
6298    // intermediate vector with the one element from Y and the element from X
6299    // that will be in the same half in the final destination (the indexes don't
6300    // matter). Then, use a shufps to build the final vector, taking the half
6301    // containing the element from Y from the intermediate, and the other half
6302    // from X.
6303    if (NumHi == 3) {
6304      // Normalize it so the 3 elements come from V1.
6305      CommuteVectorShuffleMask(PermMask, 4);
6306      std::swap(V1, V2);
6307    }
6308
6309    // Find the element from V2.
6310    unsigned HiIndex;
6311    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
6312      int Val = PermMask[HiIndex];
6313      if (Val < 0)
6314        continue;
6315      if (Val >= 4)
6316        break;
6317    }
6318
6319    Mask1[0] = PermMask[HiIndex];
6320    Mask1[1] = -1;
6321    Mask1[2] = PermMask[HiIndex^1];
6322    Mask1[3] = -1;
6323    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6324
6325    if (HiIndex >= 2) {
6326      Mask1[0] = PermMask[0];
6327      Mask1[1] = PermMask[1];
6328      Mask1[2] = HiIndex & 1 ? 6 : 4;
6329      Mask1[3] = HiIndex & 1 ? 4 : 6;
6330      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
6331    }
6332
6333    Mask1[0] = HiIndex & 1 ? 2 : 0;
6334    Mask1[1] = HiIndex & 1 ? 0 : 2;
6335    Mask1[2] = PermMask[2];
6336    Mask1[3] = PermMask[3];
6337    if (Mask1[2] >= 0)
6338      Mask1[2] += 4;
6339    if (Mask1[3] >= 0)
6340      Mask1[3] += 4;
6341    return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
6342  }
6343
6344  // Break it into (shuffle shuffle_hi, shuffle_lo).
6345  int LoMask[] = { -1, -1, -1, -1 };
6346  int HiMask[] = { -1, -1, -1, -1 };
6347
6348  int *MaskPtr = LoMask;
6349  unsigned MaskIdx = 0;
6350  unsigned LoIdx = 0;
6351  unsigned HiIdx = 2;
6352  for (unsigned i = 0; i != 4; ++i) {
6353    if (i == 2) {
6354      MaskPtr = HiMask;
6355      MaskIdx = 1;
6356      LoIdx = 0;
6357      HiIdx = 2;
6358    }
6359    int Idx = PermMask[i];
6360    if (Idx < 0) {
6361      Locs[i] = std::make_pair(-1, -1);
6362    } else if (Idx < 4) {
6363      Locs[i] = std::make_pair(MaskIdx, LoIdx);
6364      MaskPtr[LoIdx] = Idx;
6365      LoIdx++;
6366    } else {
6367      Locs[i] = std::make_pair(MaskIdx, HiIdx);
6368      MaskPtr[HiIdx] = Idx;
6369      HiIdx++;
6370    }
6371  }
6372
6373  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
6374  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
6375  int MaskOps[] = { -1, -1, -1, -1 };
6376  for (unsigned i = 0; i != 4; ++i)
6377    if (Locs[i].first != -1)
6378      MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
6379  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
6380}
6381
6382static bool MayFoldVectorLoad(SDValue V) {
6383  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
6384    V = V.getOperand(0);
6385  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6386    V = V.getOperand(0);
6387  if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
6388      V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
6389    // BUILD_VECTOR (load), undef
6390    V = V.getOperand(0);
6391  if (MayFoldLoad(V))
6392    return true;
6393  return false;
6394}
6395
6396// FIXME: the version above should always be used. Since there's
6397// a bug where several vector shuffles can't be folded because the
6398// DAG is not updated during lowering and a node claims to have two
6399// uses while it only has one, use this version, and let isel match
6400// another instruction if the load really happens to have more than
6401// one use. Remove this version after this bug get fixed.
6402// rdar://8434668, PR8156
6403static bool RelaxedMayFoldVectorLoad(SDValue V) {
6404  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
6405    V = V.getOperand(0);
6406  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6407    V = V.getOperand(0);
6408  if (ISD::isNormalLoad(V.getNode()))
6409    return true;
6410  return false;
6411}
6412
6413static
6414SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
6415  EVT VT = Op.getValueType();
6416
6417  // Canonizalize to v2f64.
6418  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
6419  return DAG.getNode(ISD::BITCAST, dl, VT,
6420                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
6421                                          V1, DAG));
6422}
6423
6424static
6425SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
6426                        bool HasSSE2) {
6427  SDValue V1 = Op.getOperand(0);
6428  SDValue V2 = Op.getOperand(1);
6429  EVT VT = Op.getValueType();
6430
6431  assert(VT != MVT::v2i64 && "unsupported shuffle type");
6432
6433  if (HasSSE2 && VT == MVT::v2f64)
6434    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
6435
6436  // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
6437  return DAG.getNode(ISD::BITCAST, dl, VT,
6438                     getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
6439                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
6440                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
6441}
6442
6443static
6444SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
6445  SDValue V1 = Op.getOperand(0);
6446  SDValue V2 = Op.getOperand(1);
6447  EVT VT = Op.getValueType();
6448
6449  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
6450         "unsupported shuffle type");
6451
6452  if (V2.getOpcode() == ISD::UNDEF)
6453    V2 = V1;
6454
6455  // v4i32 or v4f32
6456  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
6457}
6458
6459static
6460SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
6461  SDValue V1 = Op.getOperand(0);
6462  SDValue V2 = Op.getOperand(1);
6463  EVT VT = Op.getValueType();
6464  unsigned NumElems = VT.getVectorNumElements();
6465
6466  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
6467  // operand of these instructions is only memory, so check if there's a
6468  // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
6469  // same masks.
6470  bool CanFoldLoad = false;
6471
6472  // Trivial case, when V2 comes from a load.
6473  if (MayFoldVectorLoad(V2))
6474    CanFoldLoad = true;
6475
6476  // When V1 is a load, it can be folded later into a store in isel, example:
6477  //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
6478  //    turns into:
6479  //  (MOVLPSmr addr:$src1, VR128:$src2)
6480  // So, recognize this potential and also use MOVLPS or MOVLPD
6481  else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
6482    CanFoldLoad = true;
6483
6484  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6485  if (CanFoldLoad) {
6486    if (HasSSE2 && NumElems == 2)
6487      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
6488
6489    if (NumElems == 4)
6490      // If we don't care about the second element, proceed to use movss.
6491      if (SVOp->getMaskElt(1) != -1)
6492        return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
6493  }
6494
6495  // movl and movlp will both match v2i64, but v2i64 is never matched by
6496  // movl earlier because we make it strict to avoid messing with the movlp load
6497  // folding logic (see the code above getMOVLP call). Match it here then,
6498  // this is horrible, but will stay like this until we move all shuffle
6499  // matching to x86 specific nodes. Note that for the 1st condition all
6500  // types are matched with movsd.
6501  if (HasSSE2) {
6502    // FIXME: isMOVLMask should be checked and matched before getMOVLP,
6503    // as to remove this logic from here, as much as possible
6504    if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
6505      return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
6506    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
6507  }
6508
6509  assert(VT != MVT::v4i32 && "unsupported shuffle type");
6510
6511  // Invert the operand order and use SHUFPS to match it.
6512  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
6513                              getShuffleSHUFImmediate(SVOp), DAG);
6514}
6515
6516SDValue
6517X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
6518  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6519  EVT VT = Op.getValueType();
6520  DebugLoc dl = Op.getDebugLoc();
6521  SDValue V1 = Op.getOperand(0);
6522  SDValue V2 = Op.getOperand(1);
6523
6524  if (isZeroShuffle(SVOp))
6525    return getZeroVector(VT, Subtarget, DAG, dl);
6526
6527  // Handle splat operations
6528  if (SVOp->isSplat()) {
6529    unsigned NumElem = VT.getVectorNumElements();
6530    int Size = VT.getSizeInBits();
6531
6532    // Use vbroadcast whenever the splat comes from a foldable load
6533    SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
6534    if (Broadcast.getNode())
6535      return Broadcast;
6536
6537    // Handle splats by matching through known shuffle masks
6538    if ((Size == 128 && NumElem <= 4) ||
6539        (Size == 256 && NumElem < 8))
6540      return SDValue();
6541
6542    // All remaning splats are promoted to target supported vector shuffles.
6543    return PromoteSplat(SVOp, DAG);
6544  }
6545
6546  // If the shuffle can be profitably rewritten as a narrower shuffle, then
6547  // do it!
6548  if (VT == MVT::v8i16  || VT == MVT::v16i8 ||
6549      VT == MVT::v16i16 || VT == MVT::v32i8) {
6550    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6551    if (NewOp.getNode())
6552      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
6553  } else if ((VT == MVT::v4i32 ||
6554             (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
6555    // FIXME: Figure out a cleaner way to do this.
6556    // Try to make use of movq to zero out the top part.
6557    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
6558      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6559      if (NewOp.getNode()) {
6560        EVT NewVT = NewOp.getValueType();
6561        if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
6562                               NewVT, true, false))
6563          return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
6564                              DAG, Subtarget, dl);
6565      }
6566    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
6567      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6568      if (NewOp.getNode()) {
6569        EVT NewVT = NewOp.getValueType();
6570        if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
6571          return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
6572                              DAG, Subtarget, dl);
6573      }
6574    }
6575  }
6576  return SDValue();
6577}
6578
6579SDValue
6580X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
6581  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6582  SDValue V1 = Op.getOperand(0);
6583  SDValue V2 = Op.getOperand(1);
6584  EVT VT = Op.getValueType();
6585  DebugLoc dl = Op.getDebugLoc();
6586  unsigned NumElems = VT.getVectorNumElements();
6587  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
6588  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
6589  bool V1IsSplat = false;
6590  bool V2IsSplat = false;
6591  bool HasSSE2 = Subtarget->hasSSE2();
6592  bool HasAVX    = Subtarget->hasAVX();
6593  bool HasAVX2   = Subtarget->hasAVX2();
6594  MachineFunction &MF = DAG.getMachineFunction();
6595  bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
6596
6597  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
6598
6599  if (V1IsUndef && V2IsUndef)
6600    return DAG.getUNDEF(VT);
6601
6602  assert(!V1IsUndef && "Op 1 of shuffle should not be undef");
6603
6604  // Vector shuffle lowering takes 3 steps:
6605  //
6606  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
6607  //    narrowing and commutation of operands should be handled.
6608  // 2) Matching of shuffles with known shuffle masks to x86 target specific
6609  //    shuffle nodes.
6610  // 3) Rewriting of unmatched masks into new generic shuffle operations,
6611  //    so the shuffle can be broken into other shuffles and the legalizer can
6612  //    try the lowering again.
6613  //
6614  // The general idea is that no vector_shuffle operation should be left to
6615  // be matched during isel, all of them must be converted to a target specific
6616  // node here.
6617
6618  // Normalize the input vectors. Here splats, zeroed vectors, profitable
6619  // narrowing and commutation of operands should be handled. The actual code
6620  // doesn't include all of those, work in progress...
6621  SDValue NewOp = NormalizeVectorShuffle(Op, DAG);
6622  if (NewOp.getNode())
6623    return NewOp;
6624
6625  SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
6626
6627  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
6628  // unpckh_undef). Only use pshufd if speed is more important than size.
6629  if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasAVX2))
6630    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
6631  if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasAVX2))
6632    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
6633
6634  if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
6635      V2IsUndef && RelaxedMayFoldVectorLoad(V1))
6636    return getMOVDDup(Op, dl, V1, DAG);
6637
6638  if (isMOVHLPS_v_undef_Mask(M, VT))
6639    return getMOVHighToLow(Op, dl, DAG);
6640
6641  // Use to match splats
6642  if (HasSSE2 && isUNPCKHMask(M, VT, HasAVX2) && V2IsUndef &&
6643      (VT == MVT::v2f64 || VT == MVT::v2i64))
6644    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
6645
6646  if (isPSHUFDMask(M, VT)) {
6647    // The actual implementation will match the mask in the if above and then
6648    // during isel it can match several different instructions, not only pshufd
6649    // as its name says, sad but true, emulate the behavior for now...
6650    if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
6651      return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
6652
6653    unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
6654
6655    if (HasAVX && (VT == MVT::v4f32 || VT == MVT::v2f64))
6656      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, DAG);
6657
6658    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
6659      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
6660
6661    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
6662                                TargetMask, DAG);
6663  }
6664
6665  // Check if this can be converted into a logical shift.
6666  bool isLeft = false;
6667  unsigned ShAmt = 0;
6668  SDValue ShVal;
6669  bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
6670  if (isShift && ShVal.hasOneUse()) {
6671    // If the shifted value has multiple uses, it may be cheaper to use
6672    // v_set0 + movlhps or movhlps, etc.
6673    EVT EltVT = VT.getVectorElementType();
6674    ShAmt *= EltVT.getSizeInBits();
6675    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
6676  }
6677
6678  if (isMOVLMask(M, VT)) {
6679    if (ISD::isBuildVectorAllZeros(V1.getNode()))
6680      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
6681    if (!isMOVLPMask(M, VT)) {
6682      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
6683        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
6684
6685      if (VT == MVT::v4i32 || VT == MVT::v4f32)
6686        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
6687    }
6688  }
6689
6690  // FIXME: fold these into legal mask.
6691  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasAVX2))
6692    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
6693
6694  if (isMOVHLPSMask(M, VT))
6695    return getMOVHighToLow(Op, dl, DAG);
6696
6697  if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
6698    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
6699
6700  if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
6701    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
6702
6703  if (isMOVLPMask(M, VT))
6704    return getMOVLP(Op, dl, DAG, HasSSE2);
6705
6706  if (ShouldXformToMOVHLPS(M, VT) ||
6707      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
6708    return CommuteVectorShuffle(SVOp, DAG);
6709
6710  if (isShift) {
6711    // No better options. Use a vshldq / vsrldq.
6712    EVT EltVT = VT.getVectorElementType();
6713    ShAmt *= EltVT.getSizeInBits();
6714    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
6715  }
6716
6717  bool Commuted = false;
6718  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
6719  // 1,1,1,1 -> v8i16 though.
6720  V1IsSplat = isSplatVector(V1.getNode());
6721  V2IsSplat = isSplatVector(V2.getNode());
6722
6723  // Canonicalize the splat or undef, if present, to be on the RHS.
6724  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
6725    CommuteVectorShuffleMask(M, NumElems);
6726    std::swap(V1, V2);
6727    std::swap(V1IsSplat, V2IsSplat);
6728    Commuted = true;
6729  }
6730
6731  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
6732    // Shuffling low element of v1 into undef, just return v1.
6733    if (V2IsUndef)
6734      return V1;
6735    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
6736    // the instruction selector will not match, so get a canonical MOVL with
6737    // swapped operands to undo the commute.
6738    return getMOVL(DAG, dl, VT, V2, V1);
6739  }
6740
6741  if (isUNPCKLMask(M, VT, HasAVX2))
6742    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6743
6744  if (isUNPCKHMask(M, VT, HasAVX2))
6745    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6746
6747  if (V2IsSplat) {
6748    // Normalize mask so all entries that point to V2 points to its first
6749    // element then try to match unpck{h|l} again. If match, return a
6750    // new vector_shuffle with the corrected mask.p
6751    SmallVector<int, 8> NewMask(M.begin(), M.end());
6752    NormalizeMask(NewMask, NumElems);
6753    if (isUNPCKLMask(NewMask, VT, HasAVX2, true))
6754      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6755    if (isUNPCKHMask(NewMask, VT, HasAVX2, true))
6756      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6757  }
6758
6759  if (Commuted) {
6760    // Commute is back and try unpck* again.
6761    // FIXME: this seems wrong.
6762    CommuteVectorShuffleMask(M, NumElems);
6763    std::swap(V1, V2);
6764    std::swap(V1IsSplat, V2IsSplat);
6765    Commuted = false;
6766
6767    if (isUNPCKLMask(M, VT, HasAVX2))
6768      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6769
6770    if (isUNPCKHMask(M, VT, HasAVX2))
6771      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6772  }
6773
6774  // Normalize the node to match x86 shuffle ops if needed
6775  if (!V2IsUndef && (isSHUFPMask(M, VT, HasAVX, /* Commuted */ true)))
6776    return CommuteVectorShuffle(SVOp, DAG);
6777
6778  // The checks below are all present in isShuffleMaskLegal, but they are
6779  // inlined here right now to enable us to directly emit target specific
6780  // nodes, and remove one by one until they don't return Op anymore.
6781
6782  if (isPALIGNRMask(M, VT, Subtarget))
6783    return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
6784                                getShufflePALIGNRImmediate(SVOp),
6785                                DAG);
6786
6787  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
6788      SVOp->getSplatIndex() == 0 && V2IsUndef) {
6789    if (VT == MVT::v2f64 || VT == MVT::v2i64)
6790      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
6791  }
6792
6793  if (isPSHUFHWMask(M, VT, HasAVX2))
6794    return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
6795                                getShufflePSHUFHWImmediate(SVOp),
6796                                DAG);
6797
6798  if (isPSHUFLWMask(M, VT, HasAVX2))
6799    return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
6800                                getShufflePSHUFLWImmediate(SVOp),
6801                                DAG);
6802
6803  if (isSHUFPMask(M, VT, HasAVX))
6804    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
6805                                getShuffleSHUFImmediate(SVOp), DAG);
6806
6807  if (isUNPCKL_v_undef_Mask(M, VT, HasAVX2))
6808    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
6809  if (isUNPCKH_v_undef_Mask(M, VT, HasAVX2))
6810    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
6811
6812  //===--------------------------------------------------------------------===//
6813  // Generate target specific nodes for 128 or 256-bit shuffles only
6814  // supported in the AVX instruction set.
6815  //
6816
6817  // Handle VMOVDDUPY permutations
6818  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasAVX))
6819    return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
6820
6821  // Handle VPERMILPS/D* permutations
6822  if (isVPERMILPMask(M, VT, HasAVX)) {
6823    if (HasAVX2 && VT == MVT::v8i32)
6824      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
6825                                  getShuffleSHUFImmediate(SVOp), DAG);
6826    return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
6827                                getShuffleSHUFImmediate(SVOp), DAG);
6828  }
6829
6830  // Handle VPERM2F128/VPERM2I128 permutations
6831  if (isVPERM2X128Mask(M, VT, HasAVX))
6832    return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
6833                                V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
6834
6835  SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
6836  if (BlendOp.getNode())
6837    return BlendOp;
6838
6839  if (V2IsUndef && HasAVX2 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
6840    SmallVector<SDValue, 8> permclMask;
6841    for (unsigned i = 0; i != 8; ++i) {
6842      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32));
6843    }
6844    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32,
6845                               &permclMask[0], 8);
6846    // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
6847    return DAG.getNode(X86ISD::VPERMV, dl, VT,
6848                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
6849  }
6850
6851  if (V2IsUndef && HasAVX2 && (VT == MVT::v4i64 || VT == MVT::v4f64))
6852    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
6853                                getShuffleCLImmediate(SVOp), DAG);
6854
6855
6856  //===--------------------------------------------------------------------===//
6857  // Since no target specific shuffle was selected for this generic one,
6858  // lower it into other known shuffles. FIXME: this isn't true yet, but
6859  // this is the plan.
6860  //
6861
6862  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
6863  if (VT == MVT::v8i16) {
6864    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG);
6865    if (NewOp.getNode())
6866      return NewOp;
6867  }
6868
6869  if (VT == MVT::v16i8) {
6870    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
6871    if (NewOp.getNode())
6872      return NewOp;
6873  }
6874
6875  if (VT == MVT::v32i8) {
6876    SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, DAG, *this);
6877    if (NewOp.getNode())
6878      return NewOp;
6879  }
6880
6881  // Handle all 128-bit wide vectors with 4 elements, and match them with
6882  // several different shuffle types.
6883  if (NumElems == 4 && VT.is128BitVector())
6884    return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
6885
6886  // Handle general 256-bit shuffles
6887  if (VT.is256BitVector())
6888    return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
6889
6890  return SDValue();
6891}
6892
6893SDValue
6894X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
6895                                                SelectionDAG &DAG) const {
6896  EVT VT = Op.getValueType();
6897  DebugLoc dl = Op.getDebugLoc();
6898
6899  if (!Op.getOperand(0).getValueType().is128BitVector())
6900    return SDValue();
6901
6902  if (VT.getSizeInBits() == 8) {
6903    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
6904                                    Op.getOperand(0), Op.getOperand(1));
6905    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
6906                                    DAG.getValueType(VT));
6907    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
6908  }
6909
6910  if (VT.getSizeInBits() == 16) {
6911    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6912    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
6913    if (Idx == 0)
6914      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
6915                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
6916                                     DAG.getNode(ISD::BITCAST, dl,
6917                                                 MVT::v4i32,
6918                                                 Op.getOperand(0)),
6919                                     Op.getOperand(1)));
6920    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
6921                                    Op.getOperand(0), Op.getOperand(1));
6922    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
6923                                    DAG.getValueType(VT));
6924    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
6925  }
6926
6927  if (VT == MVT::f32) {
6928    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
6929    // the result back to FR32 register. It's only worth matching if the
6930    // result has a single use which is a store or a bitcast to i32.  And in
6931    // the case of a store, it's not worth it if the index is a constant 0,
6932    // because a MOVSSmr can be used instead, which is smaller and faster.
6933    if (!Op.hasOneUse())
6934      return SDValue();
6935    SDNode *User = *Op.getNode()->use_begin();
6936    if ((User->getOpcode() != ISD::STORE ||
6937         (isa<ConstantSDNode>(Op.getOperand(1)) &&
6938          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
6939        (User->getOpcode() != ISD::BITCAST ||
6940         User->getValueType(0) != MVT::i32))
6941      return SDValue();
6942    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
6943                                  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
6944                                              Op.getOperand(0)),
6945                                              Op.getOperand(1));
6946    return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
6947  }
6948
6949  if (VT == MVT::i32 || VT == MVT::i64) {
6950    // ExtractPS/pextrq works with constant index.
6951    if (isa<ConstantSDNode>(Op.getOperand(1)))
6952      return Op;
6953  }
6954  return SDValue();
6955}
6956
6957
6958SDValue
6959X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
6960                                           SelectionDAG &DAG) const {
6961  if (!isa<ConstantSDNode>(Op.getOperand(1)))
6962    return SDValue();
6963
6964  SDValue Vec = Op.getOperand(0);
6965  EVT VecVT = Vec.getValueType();
6966
6967  // If this is a 256-bit vector result, first extract the 128-bit vector and
6968  // then extract the element from the 128-bit vector.
6969  if (VecVT.is256BitVector()) {
6970    DebugLoc dl = Op.getNode()->getDebugLoc();
6971    unsigned NumElems = VecVT.getVectorNumElements();
6972    SDValue Idx = Op.getOperand(1);
6973    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
6974
6975    // Get the 128-bit vector.
6976    Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
6977
6978    if (IdxVal >= NumElems/2)
6979      IdxVal -= NumElems/2;
6980    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
6981                       DAG.getConstant(IdxVal, MVT::i32));
6982  }
6983
6984  assert(VecVT.is128BitVector() && "Unexpected vector length");
6985
6986  if (Subtarget->hasSSE41()) {
6987    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
6988    if (Res.getNode())
6989      return Res;
6990  }
6991
6992  EVT VT = Op.getValueType();
6993  DebugLoc dl = Op.getDebugLoc();
6994  // TODO: handle v16i8.
6995  if (VT.getSizeInBits() == 16) {
6996    SDValue Vec = Op.getOperand(0);
6997    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6998    if (Idx == 0)
6999      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
7000                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
7001                                     DAG.getNode(ISD::BITCAST, dl,
7002                                                 MVT::v4i32, Vec),
7003                                     Op.getOperand(1)));
7004    // Transform it so it match pextrw which produces a 32-bit result.
7005    EVT EltVT = MVT::i32;
7006    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
7007                                    Op.getOperand(0), Op.getOperand(1));
7008    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
7009                                    DAG.getValueType(VT));
7010    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
7011  }
7012
7013  if (VT.getSizeInBits() == 32) {
7014    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7015    if (Idx == 0)
7016      return Op;
7017
7018    // SHUFPS the element to the lowest double word, then movss.
7019    int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
7020    EVT VVT = Op.getOperand(0).getValueType();
7021    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
7022                                       DAG.getUNDEF(VVT), Mask);
7023    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
7024                       DAG.getIntPtrConstant(0));
7025  }
7026
7027  if (VT.getSizeInBits() == 64) {
7028    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
7029    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
7030    //        to match extract_elt for f64.
7031    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7032    if (Idx == 0)
7033      return Op;
7034
7035    // UNPCKHPD the element to the lowest double word, then movsd.
7036    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
7037    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
7038    int Mask[2] = { 1, -1 };
7039    EVT VVT = Op.getOperand(0).getValueType();
7040    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
7041                                       DAG.getUNDEF(VVT), Mask);
7042    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
7043                       DAG.getIntPtrConstant(0));
7044  }
7045
7046  return SDValue();
7047}
7048
7049SDValue
7050X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
7051                                               SelectionDAG &DAG) const {
7052  EVT VT = Op.getValueType();
7053  EVT EltVT = VT.getVectorElementType();
7054  DebugLoc dl = Op.getDebugLoc();
7055
7056  SDValue N0 = Op.getOperand(0);
7057  SDValue N1 = Op.getOperand(1);
7058  SDValue N2 = Op.getOperand(2);
7059
7060  if (!VT.is128BitVector())
7061    return SDValue();
7062
7063  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
7064      isa<ConstantSDNode>(N2)) {
7065    unsigned Opc;
7066    if (VT == MVT::v8i16)
7067      Opc = X86ISD::PINSRW;
7068    else if (VT == MVT::v16i8)
7069      Opc = X86ISD::PINSRB;
7070    else
7071      Opc = X86ISD::PINSRB;
7072
7073    // Transform it so it match pinsr{b,w} which expects a GR32 as its second
7074    // argument.
7075    if (N1.getValueType() != MVT::i32)
7076      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
7077    if (N2.getValueType() != MVT::i32)
7078      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
7079    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
7080  }
7081
7082  if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
7083    // Bits [7:6] of the constant are the source select.  This will always be
7084    //  zero here.  The DAG Combiner may combine an extract_elt index into these
7085    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
7086    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
7087    // Bits [5:4] of the constant are the destination select.  This is the
7088    //  value of the incoming immediate.
7089    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
7090    //   combine either bitwise AND or insert of float 0.0 to set these bits.
7091    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
7092    // Create this as a scalar to vector..
7093    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
7094    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
7095  }
7096
7097  if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
7098    // PINSR* works with constant index.
7099    return Op;
7100  }
7101  return SDValue();
7102}
7103
7104SDValue
7105X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
7106  EVT VT = Op.getValueType();
7107  EVT EltVT = VT.getVectorElementType();
7108
7109  DebugLoc dl = Op.getDebugLoc();
7110  SDValue N0 = Op.getOperand(0);
7111  SDValue N1 = Op.getOperand(1);
7112  SDValue N2 = Op.getOperand(2);
7113
7114  // If this is a 256-bit vector result, first extract the 128-bit vector,
7115  // insert the element into the extracted half and then place it back.
7116  if (VT.is256BitVector()) {
7117    if (!isa<ConstantSDNode>(N2))
7118      return SDValue();
7119
7120    // Get the desired 128-bit vector half.
7121    unsigned NumElems = VT.getVectorNumElements();
7122    unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
7123    SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
7124
7125    // Insert the element into the desired half.
7126    bool Upper = IdxVal >= NumElems/2;
7127    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
7128                 DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32));
7129
7130    // Insert the changed part back to the 256-bit vector
7131    return Insert128BitVector(N0, V, IdxVal, DAG, dl);
7132  }
7133
7134  if (Subtarget->hasSSE41())
7135    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
7136
7137  if (EltVT == MVT::i8)
7138    return SDValue();
7139
7140  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
7141    // Transform it so it match pinsrw which expects a 16-bit value in a GR32
7142    // as its second argument.
7143    if (N1.getValueType() != MVT::i32)
7144      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
7145    if (N2.getValueType() != MVT::i32)
7146      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
7147    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
7148  }
7149  return SDValue();
7150}
7151
7152SDValue
7153X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7154  LLVMContext *Context = DAG.getContext();
7155  DebugLoc dl = Op.getDebugLoc();
7156  EVT OpVT = Op.getValueType();
7157
7158  // If this is a 256-bit vector result, first insert into a 128-bit
7159  // vector and then insert into the 256-bit vector.
7160  if (!OpVT.is128BitVector()) {
7161    // Insert into a 128-bit vector.
7162    EVT VT128 = EVT::getVectorVT(*Context,
7163                                 OpVT.getVectorElementType(),
7164                                 OpVT.getVectorNumElements() / 2);
7165
7166    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
7167
7168    // Insert the 128-bit vector.
7169    return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
7170  }
7171
7172  if (OpVT == MVT::v1i64 &&
7173      Op.getOperand(0).getValueType() == MVT::i64)
7174    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
7175
7176  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
7177  assert(OpVT.is128BitVector() && "Expected an SSE type!");
7178  return DAG.getNode(ISD::BITCAST, dl, OpVT,
7179                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
7180}
7181
7182// Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
7183// a simple subregister reference or explicit instructions to grab
7184// upper bits of a vector.
7185SDValue
7186X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
7187  if (Subtarget->hasAVX()) {
7188    DebugLoc dl = Op.getNode()->getDebugLoc();
7189    SDValue Vec = Op.getNode()->getOperand(0);
7190    SDValue Idx = Op.getNode()->getOperand(1);
7191
7192    if (Op.getNode()->getValueType(0).is128BitVector() &&
7193        Vec.getNode()->getValueType(0).is256BitVector() &&
7194        isa<ConstantSDNode>(Idx)) {
7195      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7196      return Extract128BitVector(Vec, IdxVal, DAG, dl);
7197    }
7198  }
7199  return SDValue();
7200}
7201
7202// Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
7203// simple superregister reference or explicit instructions to insert
7204// the upper bits of a vector.
7205SDValue
7206X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
7207  if (Subtarget->hasAVX()) {
7208    DebugLoc dl = Op.getNode()->getDebugLoc();
7209    SDValue Vec = Op.getNode()->getOperand(0);
7210    SDValue SubVec = Op.getNode()->getOperand(1);
7211    SDValue Idx = Op.getNode()->getOperand(2);
7212
7213    if (Op.getNode()->getValueType(0).is256BitVector() &&
7214        SubVec.getNode()->getValueType(0).is128BitVector() &&
7215        isa<ConstantSDNode>(Idx)) {
7216      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
7217      return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
7218    }
7219  }
7220  return SDValue();
7221}
7222
7223// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
7224// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
7225// one of the above mentioned nodes. It has to be wrapped because otherwise
7226// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
7227// be used to form addressing mode. These wrapped nodes will be selected
7228// into MOV32ri.
7229SDValue
7230X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
7231  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
7232
7233  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7234  // global base reg.
7235  unsigned char OpFlag = 0;
7236  unsigned WrapperKind = X86ISD::Wrapper;
7237  CodeModel::Model M = getTargetMachine().getCodeModel();
7238
7239  if (Subtarget->isPICStyleRIPRel() &&
7240      (M == CodeModel::Small || M == CodeModel::Kernel))
7241    WrapperKind = X86ISD::WrapperRIP;
7242  else if (Subtarget->isPICStyleGOT())
7243    OpFlag = X86II::MO_GOTOFF;
7244  else if (Subtarget->isPICStyleStubPIC())
7245    OpFlag = X86II::MO_PIC_BASE_OFFSET;
7246
7247  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
7248                                             CP->getAlignment(),
7249                                             CP->getOffset(), OpFlag);
7250  DebugLoc DL = CP->getDebugLoc();
7251  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7252  // With PIC, the address is actually $g + Offset.
7253  if (OpFlag) {
7254    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7255                         DAG.getNode(X86ISD::GlobalBaseReg,
7256                                     DebugLoc(), getPointerTy()),
7257                         Result);
7258  }
7259
7260  return Result;
7261}
7262
7263SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
7264  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
7265
7266  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7267  // global base reg.
7268  unsigned char OpFlag = 0;
7269  unsigned WrapperKind = X86ISD::Wrapper;
7270  CodeModel::Model M = getTargetMachine().getCodeModel();
7271
7272  if (Subtarget->isPICStyleRIPRel() &&
7273      (M == CodeModel::Small || M == CodeModel::Kernel))
7274    WrapperKind = X86ISD::WrapperRIP;
7275  else if (Subtarget->isPICStyleGOT())
7276    OpFlag = X86II::MO_GOTOFF;
7277  else if (Subtarget->isPICStyleStubPIC())
7278    OpFlag = X86II::MO_PIC_BASE_OFFSET;
7279
7280  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
7281                                          OpFlag);
7282  DebugLoc DL = JT->getDebugLoc();
7283  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7284
7285  // With PIC, the address is actually $g + Offset.
7286  if (OpFlag)
7287    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7288                         DAG.getNode(X86ISD::GlobalBaseReg,
7289                                     DebugLoc(), getPointerTy()),
7290                         Result);
7291
7292  return Result;
7293}
7294
7295SDValue
7296X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
7297  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
7298
7299  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7300  // global base reg.
7301  unsigned char OpFlag = 0;
7302  unsigned WrapperKind = X86ISD::Wrapper;
7303  CodeModel::Model M = getTargetMachine().getCodeModel();
7304
7305  if (Subtarget->isPICStyleRIPRel() &&
7306      (M == CodeModel::Small || M == CodeModel::Kernel)) {
7307    if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
7308      OpFlag = X86II::MO_GOTPCREL;
7309    WrapperKind = X86ISD::WrapperRIP;
7310  } else if (Subtarget->isPICStyleGOT()) {
7311    OpFlag = X86II::MO_GOT;
7312  } else if (Subtarget->isPICStyleStubPIC()) {
7313    OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
7314  } else if (Subtarget->isPICStyleStubNoDynamic()) {
7315    OpFlag = X86II::MO_DARWIN_NONLAZY;
7316  }
7317
7318  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
7319
7320  DebugLoc DL = Op.getDebugLoc();
7321  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7322
7323
7324  // With PIC, the address is actually $g + Offset.
7325  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
7326      !Subtarget->is64Bit()) {
7327    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7328                         DAG.getNode(X86ISD::GlobalBaseReg,
7329                                     DebugLoc(), getPointerTy()),
7330                         Result);
7331  }
7332
7333  // For symbols that require a load from a stub to get the address, emit the
7334  // load.
7335  if (isGlobalStubReference(OpFlag))
7336    Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
7337                         MachinePointerInfo::getGOT(), false, false, false, 0);
7338
7339  return Result;
7340}
7341
7342SDValue
7343X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
7344  // Create the TargetBlockAddressAddress node.
7345  unsigned char OpFlags =
7346    Subtarget->ClassifyBlockAddressReference();
7347  CodeModel::Model M = getTargetMachine().getCodeModel();
7348  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
7349  DebugLoc dl = Op.getDebugLoc();
7350  SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
7351                                       /*isTarget=*/true, OpFlags);
7352
7353  if (Subtarget->isPICStyleRIPRel() &&
7354      (M == CodeModel::Small || M == CodeModel::Kernel))
7355    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
7356  else
7357    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
7358
7359  // With PIC, the address is actually $g + Offset.
7360  if (isGlobalRelativeToPICBase(OpFlags)) {
7361    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
7362                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
7363                         Result);
7364  }
7365
7366  return Result;
7367}
7368
7369SDValue
7370X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
7371                                      int64_t Offset,
7372                                      SelectionDAG &DAG) const {
7373  // Create the TargetGlobalAddress node, folding in the constant
7374  // offset if it is legal.
7375  unsigned char OpFlags =
7376    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
7377  CodeModel::Model M = getTargetMachine().getCodeModel();
7378  SDValue Result;
7379  if (OpFlags == X86II::MO_NO_FLAG &&
7380      X86::isOffsetSuitableForCodeModel(Offset, M)) {
7381    // A direct static reference to a global.
7382    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
7383    Offset = 0;
7384  } else {
7385    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
7386  }
7387
7388  if (Subtarget->isPICStyleRIPRel() &&
7389      (M == CodeModel::Small || M == CodeModel::Kernel))
7390    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
7391  else
7392    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
7393
7394  // With PIC, the address is actually $g + Offset.
7395  if (isGlobalRelativeToPICBase(OpFlags)) {
7396    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
7397                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
7398                         Result);
7399  }
7400
7401  // For globals that require a load from a stub to get the address, emit the
7402  // load.
7403  if (isGlobalStubReference(OpFlags))
7404    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
7405                         MachinePointerInfo::getGOT(), false, false, false, 0);
7406
7407  // If there was a non-zero offset that we didn't fold, create an explicit
7408  // addition for it.
7409  if (Offset != 0)
7410    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
7411                         DAG.getConstant(Offset, getPointerTy()));
7412
7413  return Result;
7414}
7415
7416SDValue
7417X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
7418  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
7419  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
7420  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
7421}
7422
7423static SDValue
7424GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
7425           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
7426           unsigned char OperandFlags, bool LocalDynamic = false) {
7427  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7428  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
7429  DebugLoc dl = GA->getDebugLoc();
7430  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7431                                           GA->getValueType(0),
7432                                           GA->getOffset(),
7433                                           OperandFlags);
7434
7435  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
7436                                           : X86ISD::TLSADDR;
7437
7438  if (InFlag) {
7439    SDValue Ops[] = { Chain,  TGA, *InFlag };
7440    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 3);
7441  } else {
7442    SDValue Ops[]  = { Chain, TGA };
7443    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 2);
7444  }
7445
7446  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
7447  MFI->setAdjustsStack(true);
7448
7449  SDValue Flag = Chain.getValue(1);
7450  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
7451}
7452
7453// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
7454static SDValue
7455LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
7456                                const EVT PtrVT) {
7457  SDValue InFlag;
7458  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
7459  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
7460                                     DAG.getNode(X86ISD::GlobalBaseReg,
7461                                                 DebugLoc(), PtrVT), InFlag);
7462  InFlag = Chain.getValue(1);
7463
7464  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
7465}
7466
7467// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
7468static SDValue
7469LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
7470                                const EVT PtrVT) {
7471  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
7472                    X86::RAX, X86II::MO_TLSGD);
7473}
7474
7475static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
7476                                           SelectionDAG &DAG,
7477                                           const EVT PtrVT,
7478                                           bool is64Bit) {
7479  DebugLoc dl = GA->getDebugLoc();
7480
7481  // Get the start address of the TLS block for this module.
7482  X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
7483      .getInfo<X86MachineFunctionInfo>();
7484  MFI->incNumLocalDynamicTLSAccesses();
7485
7486  SDValue Base;
7487  if (is64Bit) {
7488    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
7489                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
7490  } else {
7491    SDValue InFlag;
7492    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
7493        DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag);
7494    InFlag = Chain.getValue(1);
7495    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
7496                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
7497  }
7498
7499  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
7500  // of Base.
7501
7502  // Build x@dtpoff.
7503  unsigned char OperandFlags = X86II::MO_DTPOFF;
7504  unsigned WrapperKind = X86ISD::Wrapper;
7505  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7506                                           GA->getValueType(0),
7507                                           GA->getOffset(), OperandFlags);
7508  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
7509
7510  // Add x@dtpoff with the base.
7511  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
7512}
7513
7514// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
7515static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
7516                                   const EVT PtrVT, TLSModel::Model model,
7517                                   bool is64Bit, bool isPIC) {
7518  DebugLoc dl = GA->getDebugLoc();
7519
7520  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
7521  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
7522                                                         is64Bit ? 257 : 256));
7523
7524  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
7525                                      DAG.getIntPtrConstant(0),
7526                                      MachinePointerInfo(Ptr),
7527                                      false, false, false, 0);
7528
7529  unsigned char OperandFlags = 0;
7530  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
7531  // initialexec.
7532  unsigned WrapperKind = X86ISD::Wrapper;
7533  if (model == TLSModel::LocalExec) {
7534    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
7535  } else if (model == TLSModel::InitialExec) {
7536    if (is64Bit) {
7537      OperandFlags = X86II::MO_GOTTPOFF;
7538      WrapperKind = X86ISD::WrapperRIP;
7539    } else {
7540      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
7541    }
7542  } else {
7543    llvm_unreachable("Unexpected model");
7544  }
7545
7546  // emit "addl x@ntpoff,%eax" (local exec)
7547  // or "addl x@indntpoff,%eax" (initial exec)
7548  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
7549  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7550                                           GA->getValueType(0),
7551                                           GA->getOffset(), OperandFlags);
7552  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
7553
7554  if (model == TLSModel::InitialExec) {
7555    if (isPIC && !is64Bit) {
7556      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
7557                          DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT),
7558                           Offset);
7559    }
7560
7561    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
7562                         MachinePointerInfo::getGOT(), false, false, false,
7563                         0);
7564  }
7565
7566  // The address of the thread local variable is the add of the thread
7567  // pointer with the offset of the variable.
7568  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
7569}
7570
7571SDValue
7572X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
7573
7574  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
7575  const GlobalValue *GV = GA->getGlobal();
7576
7577  if (Subtarget->isTargetELF()) {
7578    TLSModel::Model model = getTargetMachine().getTLSModel(GV);
7579
7580    switch (model) {
7581      case TLSModel::GeneralDynamic:
7582        if (Subtarget->is64Bit())
7583          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
7584        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
7585      case TLSModel::LocalDynamic:
7586        return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
7587                                           Subtarget->is64Bit());
7588      case TLSModel::InitialExec:
7589      case TLSModel::LocalExec:
7590        return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
7591                                   Subtarget->is64Bit(),
7592                         getTargetMachine().getRelocationModel() == Reloc::PIC_);
7593    }
7594    llvm_unreachable("Unknown TLS model.");
7595  }
7596
7597  if (Subtarget->isTargetDarwin()) {
7598    // Darwin only has one model of TLS.  Lower to that.
7599    unsigned char OpFlag = 0;
7600    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
7601                           X86ISD::WrapperRIP : X86ISD::Wrapper;
7602
7603    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
7604    // global base reg.
7605    bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
7606                  !Subtarget->is64Bit();
7607    if (PIC32)
7608      OpFlag = X86II::MO_TLVP_PIC_BASE;
7609    else
7610      OpFlag = X86II::MO_TLVP;
7611    DebugLoc DL = Op.getDebugLoc();
7612    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
7613                                                GA->getValueType(0),
7614                                                GA->getOffset(), OpFlag);
7615    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
7616
7617    // With PIC32, the address is actually $g + Offset.
7618    if (PIC32)
7619      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
7620                           DAG.getNode(X86ISD::GlobalBaseReg,
7621                                       DebugLoc(), getPointerTy()),
7622                           Offset);
7623
7624    // Lowering the machine isd will make sure everything is in the right
7625    // location.
7626    SDValue Chain = DAG.getEntryNode();
7627    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
7628    SDValue Args[] = { Chain, Offset };
7629    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
7630
7631    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
7632    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7633    MFI->setAdjustsStack(true);
7634
7635    // And our return value (tls address) is in the standard call return value
7636    // location.
7637    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
7638    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
7639                              Chain.getValue(1));
7640  }
7641
7642  if (Subtarget->isTargetWindows()) {
7643    // Just use the implicit TLS architecture
7644    // Need to generate someting similar to:
7645    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
7646    //                                  ; from TEB
7647    //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
7648    //   mov     rcx, qword [rdx+rcx*8]
7649    //   mov     eax, .tls$:tlsvar
7650    //   [rax+rcx] contains the address
7651    // Windows 64bit: gs:0x58
7652    // Windows 32bit: fs:__tls_array
7653
7654    // If GV is an alias then use the aliasee for determining
7655    // thread-localness.
7656    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
7657      GV = GA->resolveAliasedGlobal(false);
7658    DebugLoc dl = GA->getDebugLoc();
7659    SDValue Chain = DAG.getEntryNode();
7660
7661    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
7662    // %gs:0x58 (64-bit).
7663    Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
7664                                        ? Type::getInt8PtrTy(*DAG.getContext(),
7665                                                             256)
7666                                        : Type::getInt32PtrTy(*DAG.getContext(),
7667                                                              257));
7668
7669    SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain,
7670                                        Subtarget->is64Bit()
7671                                        ? DAG.getIntPtrConstant(0x58)
7672                                        : DAG.getExternalSymbol("_tls_array",
7673                                                                getPointerTy()),
7674                                        MachinePointerInfo(Ptr),
7675                                        false, false, false, 0);
7676
7677    // Load the _tls_index variable
7678    SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
7679    if (Subtarget->is64Bit())
7680      IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
7681                           IDX, MachinePointerInfo(), MVT::i32,
7682                           false, false, 0);
7683    else
7684      IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
7685                        false, false, false, 0);
7686
7687    SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
7688                                    getPointerTy());
7689    IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
7690
7691    SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
7692    res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
7693                      false, false, false, 0);
7694
7695    // Get the offset of start of .tls section
7696    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
7697                                             GA->getValueType(0),
7698                                             GA->getOffset(), X86II::MO_SECREL);
7699    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
7700
7701    // The address of the thread local variable is the add of the thread
7702    // pointer with the offset of the variable.
7703    return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
7704  }
7705
7706  llvm_unreachable("TLS not implemented for this target.");
7707}
7708
7709
7710/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
7711/// and take a 2 x i32 value to shift plus a shift amount.
7712SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
7713  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
7714  EVT VT = Op.getValueType();
7715  unsigned VTBits = VT.getSizeInBits();
7716  DebugLoc dl = Op.getDebugLoc();
7717  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
7718  SDValue ShOpLo = Op.getOperand(0);
7719  SDValue ShOpHi = Op.getOperand(1);
7720  SDValue ShAmt  = Op.getOperand(2);
7721  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
7722                                     DAG.getConstant(VTBits - 1, MVT::i8))
7723                       : DAG.getConstant(0, VT);
7724
7725  SDValue Tmp2, Tmp3;
7726  if (Op.getOpcode() == ISD::SHL_PARTS) {
7727    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
7728    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
7729  } else {
7730    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
7731    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
7732  }
7733
7734  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
7735                                DAG.getConstant(VTBits, MVT::i8));
7736  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
7737                             AndNode, DAG.getConstant(0, MVT::i8));
7738
7739  SDValue Hi, Lo;
7740  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
7741  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
7742  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
7743
7744  if (Op.getOpcode() == ISD::SHL_PARTS) {
7745    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
7746    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
7747  } else {
7748    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
7749    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
7750  }
7751
7752  SDValue Ops[2] = { Lo, Hi };
7753  return DAG.getMergeValues(Ops, 2, dl);
7754}
7755
7756SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
7757                                           SelectionDAG &DAG) const {
7758  EVT SrcVT = Op.getOperand(0).getValueType();
7759
7760  if (SrcVT.isVector())
7761    return SDValue();
7762
7763  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
7764         "Unknown SINT_TO_FP to lower!");
7765
7766  // These are really Legal; return the operand so the caller accepts it as
7767  // Legal.
7768  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
7769    return Op;
7770  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
7771      Subtarget->is64Bit()) {
7772    return Op;
7773  }
7774
7775  DebugLoc dl = Op.getDebugLoc();
7776  unsigned Size = SrcVT.getSizeInBits()/8;
7777  MachineFunction &MF = DAG.getMachineFunction();
7778  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
7779  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7780  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
7781                               StackSlot,
7782                               MachinePointerInfo::getFixedStack(SSFI),
7783                               false, false, 0);
7784  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
7785}
7786
7787SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
7788                                     SDValue StackSlot,
7789                                     SelectionDAG &DAG) const {
7790  // Build the FILD
7791  DebugLoc DL = Op.getDebugLoc();
7792  SDVTList Tys;
7793  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
7794  if (useSSE)
7795    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
7796  else
7797    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
7798
7799  unsigned ByteSize = SrcVT.getSizeInBits()/8;
7800
7801  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
7802  MachineMemOperand *MMO;
7803  if (FI) {
7804    int SSFI = FI->getIndex();
7805    MMO =
7806      DAG.getMachineFunction()
7807      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
7808                            MachineMemOperand::MOLoad, ByteSize, ByteSize);
7809  } else {
7810    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
7811    StackSlot = StackSlot.getOperand(1);
7812  }
7813  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
7814  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
7815                                           X86ISD::FILD, DL,
7816                                           Tys, Ops, array_lengthof(Ops),
7817                                           SrcVT, MMO);
7818
7819  if (useSSE) {
7820    Chain = Result.getValue(1);
7821    SDValue InFlag = Result.getValue(2);
7822
7823    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
7824    // shouldn't be necessary except that RFP cannot be live across
7825    // multiple blocks. When stackifier is fixed, they can be uncoupled.
7826    MachineFunction &MF = DAG.getMachineFunction();
7827    unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
7828    int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
7829    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7830    Tys = DAG.getVTList(MVT::Other);
7831    SDValue Ops[] = {
7832      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
7833    };
7834    MachineMemOperand *MMO =
7835      DAG.getMachineFunction()
7836      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
7837                            MachineMemOperand::MOStore, SSFISize, SSFISize);
7838
7839    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
7840                                    Ops, array_lengthof(Ops),
7841                                    Op.getValueType(), MMO);
7842    Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
7843                         MachinePointerInfo::getFixedStack(SSFI),
7844                         false, false, false, 0);
7845  }
7846
7847  return Result;
7848}
7849
7850// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
7851SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
7852                                               SelectionDAG &DAG) const {
7853  // This algorithm is not obvious. Here it is what we're trying to output:
7854  /*
7855     movq       %rax,  %xmm0
7856     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
7857     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
7858     #ifdef __SSE3__
7859       haddpd   %xmm0, %xmm0
7860     #else
7861       pshufd   $0x4e, %xmm0, %xmm1
7862       addpd    %xmm1, %xmm0
7863     #endif
7864  */
7865
7866  DebugLoc dl = Op.getDebugLoc();
7867  LLVMContext *Context = DAG.getContext();
7868
7869  // Build some magic constants.
7870  const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
7871  Constant *C0 = ConstantDataVector::get(*Context, CV0);
7872  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
7873
7874  SmallVector<Constant*,2> CV1;
7875  CV1.push_back(
7876        ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
7877  CV1.push_back(
7878        ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
7879  Constant *C1 = ConstantVector::get(CV1);
7880  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
7881
7882  // Load the 64-bit value into an XMM register.
7883  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
7884                            Op.getOperand(0));
7885  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
7886                              MachinePointerInfo::getConstantPool(),
7887                              false, false, false, 16);
7888  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
7889                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
7890                              CLod0);
7891
7892  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
7893                              MachinePointerInfo::getConstantPool(),
7894                              false, false, false, 16);
7895  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
7896  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
7897  SDValue Result;
7898
7899  if (Subtarget->hasSSE3()) {
7900    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
7901    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
7902  } else {
7903    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
7904    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
7905                                           S2F, 0x4E, DAG);
7906    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
7907                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
7908                         Sub);
7909  }
7910
7911  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
7912                     DAG.getIntPtrConstant(0));
7913}
7914
7915// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
7916SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
7917                                               SelectionDAG &DAG) const {
7918  DebugLoc dl = Op.getDebugLoc();
7919  // FP constant to bias correct the final result.
7920  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
7921                                   MVT::f64);
7922
7923  // Load the 32-bit value into an XMM register.
7924  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
7925                             Op.getOperand(0));
7926
7927  // Zero out the upper parts of the register.
7928  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
7929
7930  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
7931                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
7932                     DAG.getIntPtrConstant(0));
7933
7934  // Or the load with the bias.
7935  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
7936                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
7937                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7938                                                   MVT::v2f64, Load)),
7939                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
7940                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7941                                                   MVT::v2f64, Bias)));
7942  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
7943                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
7944                   DAG.getIntPtrConstant(0));
7945
7946  // Subtract the bias.
7947  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
7948
7949  // Handle final rounding.
7950  EVT DestVT = Op.getValueType();
7951
7952  if (DestVT.bitsLT(MVT::f64))
7953    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
7954                       DAG.getIntPtrConstant(0));
7955  if (DestVT.bitsGT(MVT::f64))
7956    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
7957
7958  // Handle final rounding.
7959  return Sub;
7960}
7961
7962SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
7963                                           SelectionDAG &DAG) const {
7964  SDValue N0 = Op.getOperand(0);
7965  DebugLoc dl = Op.getDebugLoc();
7966
7967  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
7968  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
7969  // the optimization here.
7970  if (DAG.SignBitIsZero(N0))
7971    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
7972
7973  EVT SrcVT = N0.getValueType();
7974  EVT DstVT = Op.getValueType();
7975  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
7976    return LowerUINT_TO_FP_i64(Op, DAG);
7977  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
7978    return LowerUINT_TO_FP_i32(Op, DAG);
7979  if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
7980    return SDValue();
7981
7982  // Make a 64-bit buffer, and use it to build an FILD.
7983  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
7984  if (SrcVT == MVT::i32) {
7985    SDValue WordOff = DAG.getConstant(4, getPointerTy());
7986    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
7987                                     getPointerTy(), StackSlot, WordOff);
7988    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
7989                                  StackSlot, MachinePointerInfo(),
7990                                  false, false, 0);
7991    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
7992                                  OffsetSlot, MachinePointerInfo(),
7993                                  false, false, 0);
7994    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
7995    return Fild;
7996  }
7997
7998  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
7999  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
8000                               StackSlot, MachinePointerInfo(),
8001                               false, false, 0);
8002  // For i64 source, we need to add the appropriate power of 2 if the input
8003  // was negative.  This is the same as the optimization in
8004  // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
8005  // we must be careful to do the computation in x87 extended precision, not
8006  // in SSE. (The generic code can't know it's OK to do this, or how to.)
8007  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
8008  MachineMemOperand *MMO =
8009    DAG.getMachineFunction()
8010    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8011                          MachineMemOperand::MOLoad, 8, 8);
8012
8013  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
8014  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
8015  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3,
8016                                         MVT::i64, MMO);
8017
8018  APInt FF(32, 0x5F800000ULL);
8019
8020  // Check whether the sign bit is set.
8021  SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
8022                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
8023                                 ISD::SETLT);
8024
8025  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
8026  SDValue FudgePtr = DAG.getConstantPool(
8027                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
8028                                         getPointerTy());
8029
8030  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
8031  SDValue Zero = DAG.getIntPtrConstant(0);
8032  SDValue Four = DAG.getIntPtrConstant(4);
8033  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
8034                               Zero, Four);
8035  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
8036
8037  // Load the value out, extending it from f32 to f80.
8038  // FIXME: Avoid the extend by constructing the right constant pool?
8039  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
8040                                 FudgePtr, MachinePointerInfo::getConstantPool(),
8041                                 MVT::f32, false, false, 4);
8042  // Extend everything to 80 bits to force it to be done on x87.
8043  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
8044  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
8045}
8046
8047std::pair<SDValue,SDValue> X86TargetLowering::
8048FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const {
8049  DebugLoc DL = Op.getDebugLoc();
8050
8051  EVT DstTy = Op.getValueType();
8052
8053  if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
8054    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
8055    DstTy = MVT::i64;
8056  }
8057
8058  assert(DstTy.getSimpleVT() <= MVT::i64 &&
8059         DstTy.getSimpleVT() >= MVT::i16 &&
8060         "Unknown FP_TO_INT to lower!");
8061
8062  // These are really Legal.
8063  if (DstTy == MVT::i32 &&
8064      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
8065    return std::make_pair(SDValue(), SDValue());
8066  if (Subtarget->is64Bit() &&
8067      DstTy == MVT::i64 &&
8068      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
8069    return std::make_pair(SDValue(), SDValue());
8070
8071  // We lower FP->int64 either into FISTP64 followed by a load from a temporary
8072  // stack slot, or into the FTOL runtime function.
8073  MachineFunction &MF = DAG.getMachineFunction();
8074  unsigned MemSize = DstTy.getSizeInBits()/8;
8075  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
8076  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
8077
8078  unsigned Opc;
8079  if (!IsSigned && isIntegerTypeFTOL(DstTy))
8080    Opc = X86ISD::WIN_FTOL;
8081  else
8082    switch (DstTy.getSimpleVT().SimpleTy) {
8083    default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
8084    case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
8085    case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
8086    case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
8087    }
8088
8089  SDValue Chain = DAG.getEntryNode();
8090  SDValue Value = Op.getOperand(0);
8091  EVT TheVT = Op.getOperand(0).getValueType();
8092  // FIXME This causes a redundant load/store if the SSE-class value is already
8093  // in memory, such as if it is on the callstack.
8094  if (isScalarFPTypeInSSEReg(TheVT)) {
8095    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
8096    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
8097                         MachinePointerInfo::getFixedStack(SSFI),
8098                         false, false, 0);
8099    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
8100    SDValue Ops[] = {
8101      Chain, StackSlot, DAG.getValueType(TheVT)
8102    };
8103
8104    MachineMemOperand *MMO =
8105      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8106                              MachineMemOperand::MOLoad, MemSize, MemSize);
8107    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3,
8108                                    DstTy, MMO);
8109    Chain = Value.getValue(1);
8110    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
8111    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
8112  }
8113
8114  MachineMemOperand *MMO =
8115    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8116                            MachineMemOperand::MOStore, MemSize, MemSize);
8117
8118  if (Opc != X86ISD::WIN_FTOL) {
8119    // Build the FP_TO_INT*_IN_MEM
8120    SDValue Ops[] = { Chain, Value, StackSlot };
8121    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
8122                                           Ops, 3, DstTy, MMO);
8123    return std::make_pair(FIST, StackSlot);
8124  } else {
8125    SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
8126      DAG.getVTList(MVT::Other, MVT::Glue),
8127      Chain, Value);
8128    SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
8129      MVT::i32, ftol.getValue(1));
8130    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
8131      MVT::i32, eax.getValue(2));
8132    SDValue Ops[] = { eax, edx };
8133    SDValue pair = IsReplace
8134      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2)
8135      : DAG.getMergeValues(Ops, 2, DL);
8136    return std::make_pair(pair, SDValue());
8137  }
8138}
8139
8140SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
8141                                           SelectionDAG &DAG) const {
8142  if (Op.getValueType().isVector())
8143    return SDValue();
8144
8145  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
8146    /*IsSigned=*/ true, /*IsReplace=*/ false);
8147  SDValue FIST = Vals.first, StackSlot = Vals.second;
8148  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
8149  if (FIST.getNode() == 0) return Op;
8150
8151  if (StackSlot.getNode())
8152    // Load the result.
8153    return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
8154                       FIST, StackSlot, MachinePointerInfo(),
8155                       false, false, false, 0);
8156
8157  // The node is the result.
8158  return FIST;
8159}
8160
8161SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
8162                                           SelectionDAG &DAG) const {
8163  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
8164    /*IsSigned=*/ false, /*IsReplace=*/ false);
8165  SDValue FIST = Vals.first, StackSlot = Vals.second;
8166  assert(FIST.getNode() && "Unexpected failure");
8167
8168  if (StackSlot.getNode())
8169    // Load the result.
8170    return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
8171                       FIST, StackSlot, MachinePointerInfo(),
8172                       false, false, false, 0);
8173
8174  // The node is the result.
8175  return FIST;
8176}
8177
8178SDValue X86TargetLowering::LowerFABS(SDValue Op,
8179                                     SelectionDAG &DAG) const {
8180  LLVMContext *Context = DAG.getContext();
8181  DebugLoc dl = Op.getDebugLoc();
8182  EVT VT = Op.getValueType();
8183  EVT EltVT = VT;
8184  if (VT.isVector())
8185    EltVT = VT.getVectorElementType();
8186  Constant *C;
8187  if (EltVT == MVT::f64) {
8188    C = ConstantVector::getSplat(2,
8189                ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
8190  } else {
8191    C = ConstantVector::getSplat(4,
8192               ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
8193  }
8194  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
8195  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
8196                             MachinePointerInfo::getConstantPool(),
8197                             false, false, false, 16);
8198  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
8199}
8200
8201SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
8202  LLVMContext *Context = DAG.getContext();
8203  DebugLoc dl = Op.getDebugLoc();
8204  EVT VT = Op.getValueType();
8205  EVT EltVT = VT;
8206  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
8207  if (VT.isVector()) {
8208    EltVT = VT.getVectorElementType();
8209    NumElts = VT.getVectorNumElements();
8210  }
8211  Constant *C;
8212  if (EltVT == MVT::f64)
8213    C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
8214  else
8215    C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
8216  C = ConstantVector::getSplat(NumElts, C);
8217  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
8218  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
8219                             MachinePointerInfo::getConstantPool(),
8220                             false, false, false, 16);
8221  if (VT.isVector()) {
8222    MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
8223    return DAG.getNode(ISD::BITCAST, dl, VT,
8224                       DAG.getNode(ISD::XOR, dl, XORVT,
8225                                   DAG.getNode(ISD::BITCAST, dl, XORVT,
8226                                               Op.getOperand(0)),
8227                                   DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
8228  }
8229
8230  return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
8231}
8232
8233SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8234  LLVMContext *Context = DAG.getContext();
8235  SDValue Op0 = Op.getOperand(0);
8236  SDValue Op1 = Op.getOperand(1);
8237  DebugLoc dl = Op.getDebugLoc();
8238  EVT VT = Op.getValueType();
8239  EVT SrcVT = Op1.getValueType();
8240
8241  // If second operand is smaller, extend it first.
8242  if (SrcVT.bitsLT(VT)) {
8243    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
8244    SrcVT = VT;
8245  }
8246  // And if it is bigger, shrink it first.
8247  if (SrcVT.bitsGT(VT)) {
8248    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
8249    SrcVT = VT;
8250  }
8251
8252  // At this point the operands and the result should have the same
8253  // type, and that won't be f80 since that is not custom lowered.
8254
8255  // First get the sign bit of second operand.
8256  SmallVector<Constant*,4> CV;
8257  if (SrcVT == MVT::f64) {
8258    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
8259    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
8260  } else {
8261    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
8262    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8263    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8264    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8265  }
8266  Constant *C = ConstantVector::get(CV);
8267  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
8268  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
8269                              MachinePointerInfo::getConstantPool(),
8270                              false, false, false, 16);
8271  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
8272
8273  // Shift sign bit right or left if the two operands have different types.
8274  if (SrcVT.bitsGT(VT)) {
8275    // Op0 is MVT::f32, Op1 is MVT::f64.
8276    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
8277    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
8278                          DAG.getConstant(32, MVT::i32));
8279    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
8280    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
8281                          DAG.getIntPtrConstant(0));
8282  }
8283
8284  // Clear first operand sign bit.
8285  CV.clear();
8286  if (VT == MVT::f64) {
8287    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
8288    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
8289  } else {
8290    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
8291    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8292    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8293    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
8294  }
8295  C = ConstantVector::get(CV);
8296  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
8297  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
8298                              MachinePointerInfo::getConstantPool(),
8299                              false, false, false, 16);
8300  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
8301
8302  // Or the value with the sign bit.
8303  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
8304}
8305
8306SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const {
8307  SDValue N0 = Op.getOperand(0);
8308  DebugLoc dl = Op.getDebugLoc();
8309  EVT VT = Op.getValueType();
8310
8311  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
8312  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
8313                                  DAG.getConstant(1, VT));
8314  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
8315}
8316
8317/// Emit nodes that will be selected as "test Op0,Op0", or something
8318/// equivalent.
8319SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
8320                                    SelectionDAG &DAG) const {
8321  DebugLoc dl = Op.getDebugLoc();
8322
8323  // CF and OF aren't always set the way we want. Determine which
8324  // of these we need.
8325  bool NeedCF = false;
8326  bool NeedOF = false;
8327  switch (X86CC) {
8328  default: break;
8329  case X86::COND_A: case X86::COND_AE:
8330  case X86::COND_B: case X86::COND_BE:
8331    NeedCF = true;
8332    break;
8333  case X86::COND_G: case X86::COND_GE:
8334  case X86::COND_L: case X86::COND_LE:
8335  case X86::COND_O: case X86::COND_NO:
8336    NeedOF = true;
8337    break;
8338  }
8339
8340  // See if we can use the EFLAGS value from the operand instead of
8341  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
8342  // we prove that the arithmetic won't overflow, we can't use OF or CF.
8343  if (Op.getResNo() != 0 || NeedOF || NeedCF)
8344    // Emit a CMP with 0, which is the TEST pattern.
8345    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
8346                       DAG.getConstant(0, Op.getValueType()));
8347
8348  unsigned Opcode = 0;
8349  unsigned NumOperands = 0;
8350
8351  // Truncate operations may prevent the merge of the SETCC instruction
8352  // and the arithmetic intruction before it. Attempt to truncate the operands
8353  // of the arithmetic instruction and use a reduced bit-width instruction.
8354  bool NeedTruncation = false;
8355  SDValue ArithOp = Op;
8356  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
8357    SDValue Arith = Op->getOperand(0);
8358    // Both the trunc and the arithmetic op need to have one user each.
8359    if (Arith->hasOneUse())
8360      switch (Arith.getOpcode()) {
8361        default: break;
8362        case ISD::ADD:
8363        case ISD::SUB:
8364        case ISD::AND:
8365        case ISD::OR:
8366        case ISD::XOR: {
8367          NeedTruncation = true;
8368          ArithOp = Arith;
8369        }
8370      }
8371  }
8372
8373  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
8374  // which may be the result of a CAST.  We use the variable 'Op', which is the
8375  // non-casted variable when we check for possible users.
8376  switch (ArithOp.getOpcode()) {
8377  case ISD::ADD:
8378    // Due to an isel shortcoming, be conservative if this add is likely to be
8379    // selected as part of a load-modify-store instruction. When the root node
8380    // in a match is a store, isel doesn't know how to remap non-chain non-flag
8381    // uses of other nodes in the match, such as the ADD in this case. This
8382    // leads to the ADD being left around and reselected, with the result being
8383    // two adds in the output.  Alas, even if none our users are stores, that
8384    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
8385    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
8386    // climbing the DAG back to the root, and it doesn't seem to be worth the
8387    // effort.
8388    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8389         UE = Op.getNode()->use_end(); UI != UE; ++UI)
8390      if (UI->getOpcode() != ISD::CopyToReg &&
8391          UI->getOpcode() != ISD::SETCC &&
8392          UI->getOpcode() != ISD::STORE)
8393        goto default_case;
8394
8395    if (ConstantSDNode *C =
8396        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
8397      // An add of one will be selected as an INC.
8398      if (C->getAPIntValue() == 1) {
8399        Opcode = X86ISD::INC;
8400        NumOperands = 1;
8401        break;
8402      }
8403
8404      // An add of negative one (subtract of one) will be selected as a DEC.
8405      if (C->getAPIntValue().isAllOnesValue()) {
8406        Opcode = X86ISD::DEC;
8407        NumOperands = 1;
8408        break;
8409      }
8410    }
8411
8412    // Otherwise use a regular EFLAGS-setting add.
8413    Opcode = X86ISD::ADD;
8414    NumOperands = 2;
8415    break;
8416  case ISD::AND: {
8417    // If the primary and result isn't used, don't bother using X86ISD::AND,
8418    // because a TEST instruction will be better.
8419    bool NonFlagUse = false;
8420    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8421           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
8422      SDNode *User = *UI;
8423      unsigned UOpNo = UI.getOperandNo();
8424      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
8425        // Look pass truncate.
8426        UOpNo = User->use_begin().getOperandNo();
8427        User = *User->use_begin();
8428      }
8429
8430      if (User->getOpcode() != ISD::BRCOND &&
8431          User->getOpcode() != ISD::SETCC &&
8432          !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
8433        NonFlagUse = true;
8434        break;
8435      }
8436    }
8437
8438    if (!NonFlagUse)
8439      break;
8440  }
8441    // FALL THROUGH
8442  case ISD::SUB:
8443  case ISD::OR:
8444  case ISD::XOR:
8445    // Due to the ISEL shortcoming noted above, be conservative if this op is
8446    // likely to be selected as part of a load-modify-store instruction.
8447    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8448           UE = Op.getNode()->use_end(); UI != UE; ++UI)
8449      if (UI->getOpcode() == ISD::STORE)
8450        goto default_case;
8451
8452    // Otherwise use a regular EFLAGS-setting instruction.
8453    switch (ArithOp.getOpcode()) {
8454    default: llvm_unreachable("unexpected operator!");
8455    case ISD::SUB: Opcode = X86ISD::SUB; break;
8456    case ISD::OR:  Opcode = X86ISD::OR;  break;
8457    case ISD::XOR: Opcode = X86ISD::XOR; break;
8458    case ISD::AND: Opcode = X86ISD::AND; break;
8459    }
8460
8461    NumOperands = 2;
8462    break;
8463  case X86ISD::ADD:
8464  case X86ISD::SUB:
8465  case X86ISD::INC:
8466  case X86ISD::DEC:
8467  case X86ISD::OR:
8468  case X86ISD::XOR:
8469  case X86ISD::AND:
8470    return SDValue(Op.getNode(), 1);
8471  default:
8472  default_case:
8473    break;
8474  }
8475
8476  // If we found that truncation is beneficial, perform the truncation and
8477  // update 'Op'.
8478  if (NeedTruncation) {
8479    EVT VT = Op.getValueType();
8480    SDValue WideVal = Op->getOperand(0);
8481    EVT WideVT = WideVal.getValueType();
8482    unsigned ConvertedOp = 0;
8483    // Use a target machine opcode to prevent further DAGCombine
8484    // optimizations that may separate the arithmetic operations
8485    // from the setcc node.
8486    switch (WideVal.getOpcode()) {
8487      default: break;
8488      case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
8489      case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
8490      case ISD::AND: ConvertedOp = X86ISD::AND; break;
8491      case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
8492      case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
8493    }
8494
8495    if (ConvertedOp) {
8496      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8497      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
8498        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
8499        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
8500        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
8501      }
8502    }
8503  }
8504
8505  if (Opcode == 0)
8506    // Emit a CMP with 0, which is the TEST pattern.
8507    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
8508                       DAG.getConstant(0, Op.getValueType()));
8509
8510  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
8511  SmallVector<SDValue, 4> Ops;
8512  for (unsigned i = 0; i != NumOperands; ++i)
8513    Ops.push_back(Op.getOperand(i));
8514
8515  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
8516  DAG.ReplaceAllUsesWith(Op, New);
8517  return SDValue(New.getNode(), 1);
8518}
8519
8520/// Emit nodes that will be selected as "cmp Op0,Op1", or something
8521/// equivalent.
8522SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
8523                                   SelectionDAG &DAG) const {
8524  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
8525    if (C->getAPIntValue() == 0)
8526      return EmitTest(Op0, X86CC, DAG);
8527
8528  DebugLoc dl = Op0.getDebugLoc();
8529  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
8530       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
8531    // Use SUB instead of CMP to enable CSE between SUB and CMP.
8532    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
8533    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
8534                              Op0, Op1);
8535    return SDValue(Sub.getNode(), 1);
8536  }
8537  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
8538}
8539
8540/// Convert a comparison if required by the subtarget.
8541SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
8542                                                 SelectionDAG &DAG) const {
8543  // If the subtarget does not support the FUCOMI instruction, floating-point
8544  // comparisons have to be converted.
8545  if (Subtarget->hasCMov() ||
8546      Cmp.getOpcode() != X86ISD::CMP ||
8547      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
8548      !Cmp.getOperand(1).getValueType().isFloatingPoint())
8549    return Cmp;
8550
8551  // The instruction selector will select an FUCOM instruction instead of
8552  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
8553  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
8554  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
8555  DebugLoc dl = Cmp.getDebugLoc();
8556  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
8557  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
8558  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
8559                            DAG.getConstant(8, MVT::i8));
8560  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
8561  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
8562}
8563
8564/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
8565/// if it's possible.
8566SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
8567                                     DebugLoc dl, SelectionDAG &DAG) const {
8568  SDValue Op0 = And.getOperand(0);
8569  SDValue Op1 = And.getOperand(1);
8570  if (Op0.getOpcode() == ISD::TRUNCATE)
8571    Op0 = Op0.getOperand(0);
8572  if (Op1.getOpcode() == ISD::TRUNCATE)
8573    Op1 = Op1.getOperand(0);
8574
8575  SDValue LHS, RHS;
8576  if (Op1.getOpcode() == ISD::SHL)
8577    std::swap(Op0, Op1);
8578  if (Op0.getOpcode() == ISD::SHL) {
8579    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
8580      if (And00C->getZExtValue() == 1) {
8581        // If we looked past a truncate, check that it's only truncating away
8582        // known zeros.
8583        unsigned BitWidth = Op0.getValueSizeInBits();
8584        unsigned AndBitWidth = And.getValueSizeInBits();
8585        if (BitWidth > AndBitWidth) {
8586          APInt Zeros, Ones;
8587          DAG.ComputeMaskedBits(Op0, Zeros, Ones);
8588          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
8589            return SDValue();
8590        }
8591        LHS = Op1;
8592        RHS = Op0.getOperand(1);
8593      }
8594  } else if (Op1.getOpcode() == ISD::Constant) {
8595    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
8596    uint64_t AndRHSVal = AndRHS->getZExtValue();
8597    SDValue AndLHS = Op0;
8598
8599    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
8600      LHS = AndLHS.getOperand(0);
8601      RHS = AndLHS.getOperand(1);
8602    }
8603
8604    // Use BT if the immediate can't be encoded in a TEST instruction.
8605    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
8606      LHS = AndLHS;
8607      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
8608    }
8609  }
8610
8611  if (LHS.getNode()) {
8612    // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
8613    // instruction.  Since the shift amount is in-range-or-undefined, we know
8614    // that doing a bittest on the i32 value is ok.  We extend to i32 because
8615    // the encoding for the i16 version is larger than the i32 version.
8616    // Also promote i16 to i32 for performance / code size reason.
8617    if (LHS.getValueType() == MVT::i8 ||
8618        LHS.getValueType() == MVT::i16)
8619      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
8620
8621    // If the operand types disagree, extend the shift amount to match.  Since
8622    // BT ignores high bits (like shifts) we can use anyextend.
8623    if (LHS.getValueType() != RHS.getValueType())
8624      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
8625
8626    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
8627    unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
8628    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
8629                       DAG.getConstant(Cond, MVT::i8), BT);
8630  }
8631
8632  return SDValue();
8633}
8634
8635SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
8636
8637  if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG);
8638
8639  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
8640  SDValue Op0 = Op.getOperand(0);
8641  SDValue Op1 = Op.getOperand(1);
8642  DebugLoc dl = Op.getDebugLoc();
8643  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8644
8645  // Optimize to BT if possible.
8646  // Lower (X & (1 << N)) == 0 to BT(X, N).
8647  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
8648  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
8649  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
8650      Op1.getOpcode() == ISD::Constant &&
8651      cast<ConstantSDNode>(Op1)->isNullValue() &&
8652      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
8653    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
8654    if (NewSetCC.getNode())
8655      return NewSetCC;
8656  }
8657
8658  // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
8659  // these.
8660  if (Op1.getOpcode() == ISD::Constant &&
8661      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
8662       cast<ConstantSDNode>(Op1)->isNullValue()) &&
8663      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
8664
8665    // If the input is a setcc, then reuse the input setcc or use a new one with
8666    // the inverted condition.
8667    if (Op0.getOpcode() == X86ISD::SETCC) {
8668      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
8669      bool Invert = (CC == ISD::SETNE) ^
8670        cast<ConstantSDNode>(Op1)->isNullValue();
8671      if (!Invert) return Op0;
8672
8673      CCode = X86::GetOppositeBranchCondition(CCode);
8674      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
8675                         DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
8676    }
8677  }
8678
8679  bool isFP = Op1.getValueType().isFloatingPoint();
8680  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
8681  if (X86CC == X86::COND_INVALID)
8682    return SDValue();
8683
8684  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
8685  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
8686  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
8687                     DAG.getConstant(X86CC, MVT::i8), EFLAGS);
8688}
8689
8690// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
8691// ones, and then concatenate the result back.
8692static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
8693  EVT VT = Op.getValueType();
8694
8695  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
8696         "Unsupported value type for operation");
8697
8698  unsigned NumElems = VT.getVectorNumElements();
8699  DebugLoc dl = Op.getDebugLoc();
8700  SDValue CC = Op.getOperand(2);
8701
8702  // Extract the LHS vectors
8703  SDValue LHS = Op.getOperand(0);
8704  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
8705  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
8706
8707  // Extract the RHS vectors
8708  SDValue RHS = Op.getOperand(1);
8709  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
8710  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
8711
8712  // Issue the operation on the smaller types and concatenate the result back
8713  MVT EltVT = VT.getVectorElementType().getSimpleVT();
8714  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
8715  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
8716                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
8717                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
8718}
8719
8720
8721SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
8722  SDValue Cond;
8723  SDValue Op0 = Op.getOperand(0);
8724  SDValue Op1 = Op.getOperand(1);
8725  SDValue CC = Op.getOperand(2);
8726  EVT VT = Op.getValueType();
8727  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
8728  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
8729  DebugLoc dl = Op.getDebugLoc();
8730
8731  if (isFP) {
8732#ifndef NDEBUG
8733    EVT EltVT = Op0.getValueType().getVectorElementType();
8734    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
8735#endif
8736
8737    unsigned SSECC;
8738    bool Swap = false;
8739
8740    // SSE Condition code mapping:
8741    //  0 - EQ
8742    //  1 - LT
8743    //  2 - LE
8744    //  3 - UNORD
8745    //  4 - NEQ
8746    //  5 - NLT
8747    //  6 - NLE
8748    //  7 - ORD
8749    switch (SetCCOpcode) {
8750    default: llvm_unreachable("Unexpected SETCC condition");
8751    case ISD::SETOEQ:
8752    case ISD::SETEQ:  SSECC = 0; break;
8753    case ISD::SETOGT:
8754    case ISD::SETGT: Swap = true; // Fallthrough
8755    case ISD::SETLT:
8756    case ISD::SETOLT: SSECC = 1; break;
8757    case ISD::SETOGE:
8758    case ISD::SETGE: Swap = true; // Fallthrough
8759    case ISD::SETLE:
8760    case ISD::SETOLE: SSECC = 2; break;
8761    case ISD::SETUO:  SSECC = 3; break;
8762    case ISD::SETUNE:
8763    case ISD::SETNE:  SSECC = 4; break;
8764    case ISD::SETULE: Swap = true; // Fallthrough
8765    case ISD::SETUGE: SSECC = 5; break;
8766    case ISD::SETULT: Swap = true; // Fallthrough
8767    case ISD::SETUGT: SSECC = 6; break;
8768    case ISD::SETO:   SSECC = 7; break;
8769    case ISD::SETUEQ:
8770    case ISD::SETONE: SSECC = 8; break;
8771    }
8772    if (Swap)
8773      std::swap(Op0, Op1);
8774
8775    // In the two special cases we can't handle, emit two comparisons.
8776    if (SSECC == 8) {
8777      unsigned CC0, CC1;
8778      unsigned CombineOpc;
8779      if (SetCCOpcode == ISD::SETUEQ) {
8780        CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
8781      } else {
8782        assert(SetCCOpcode == ISD::SETONE);
8783        CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
8784      }
8785
8786      SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
8787                                 DAG.getConstant(CC0, MVT::i8));
8788      SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
8789                                 DAG.getConstant(CC1, MVT::i8));
8790      return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
8791    }
8792    // Handle all other FP comparisons here.
8793    return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
8794                       DAG.getConstant(SSECC, MVT::i8));
8795  }
8796
8797  // Break 256-bit integer vector compare into smaller ones.
8798  if (VT.is256BitVector() && !Subtarget->hasAVX2())
8799    return Lower256IntVSETCC(Op, DAG);
8800
8801  // We are handling one of the integer comparisons here.  Since SSE only has
8802  // GT and EQ comparisons for integer, swapping operands and multiple
8803  // operations may be required for some comparisons.
8804  unsigned Opc;
8805  bool Swap = false, Invert = false, FlipSigns = false;
8806
8807  switch (SetCCOpcode) {
8808  default: llvm_unreachable("Unexpected SETCC condition");
8809  case ISD::SETNE:  Invert = true;
8810  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
8811  case ISD::SETLT:  Swap = true;
8812  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
8813  case ISD::SETGE:  Swap = true;
8814  case ISD::SETLE:  Opc = X86ISD::PCMPGT; Invert = true; break;
8815  case ISD::SETULT: Swap = true;
8816  case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break;
8817  case ISD::SETUGE: Swap = true;
8818  case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
8819  }
8820  if (Swap)
8821    std::swap(Op0, Op1);
8822
8823  // Check that the operation in question is available (most are plain SSE2,
8824  // but PCMPGTQ and PCMPEQQ have different requirements).
8825  if (VT == MVT::v2i64) {
8826    if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42())
8827      return SDValue();
8828    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41())
8829      return SDValue();
8830  }
8831
8832  // Since SSE has no unsigned integer comparisons, we need to flip  the sign
8833  // bits of the inputs before performing those operations.
8834  if (FlipSigns) {
8835    EVT EltVT = VT.getVectorElementType();
8836    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
8837                                      EltVT);
8838    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
8839    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
8840                                    SignBits.size());
8841    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
8842    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
8843  }
8844
8845  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
8846
8847  // If the logical-not of the result is required, perform that now.
8848  if (Invert)
8849    Result = DAG.getNOT(dl, Result, VT);
8850
8851  return Result;
8852}
8853
8854// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
8855static bool isX86LogicalCmp(SDValue Op) {
8856  unsigned Opc = Op.getNode()->getOpcode();
8857  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
8858      Opc == X86ISD::SAHF)
8859    return true;
8860  if (Op.getResNo() == 1 &&
8861      (Opc == X86ISD::ADD ||
8862       Opc == X86ISD::SUB ||
8863       Opc == X86ISD::ADC ||
8864       Opc == X86ISD::SBB ||
8865       Opc == X86ISD::SMUL ||
8866       Opc == X86ISD::UMUL ||
8867       Opc == X86ISD::INC ||
8868       Opc == X86ISD::DEC ||
8869       Opc == X86ISD::OR ||
8870       Opc == X86ISD::XOR ||
8871       Opc == X86ISD::AND))
8872    return true;
8873
8874  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
8875    return true;
8876
8877  return false;
8878}
8879
8880static bool isZero(SDValue V) {
8881  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
8882  return C && C->isNullValue();
8883}
8884
8885static bool isAllOnes(SDValue V) {
8886  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
8887  return C && C->isAllOnesValue();
8888}
8889
8890static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
8891  if (V.getOpcode() != ISD::TRUNCATE)
8892    return false;
8893
8894  SDValue VOp0 = V.getOperand(0);
8895  unsigned InBits = VOp0.getValueSizeInBits();
8896  unsigned Bits = V.getValueSizeInBits();
8897  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
8898}
8899
8900SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
8901  bool addTest = true;
8902  SDValue Cond  = Op.getOperand(0);
8903  SDValue Op1 = Op.getOperand(1);
8904  SDValue Op2 = Op.getOperand(2);
8905  DebugLoc DL = Op.getDebugLoc();
8906  SDValue CC;
8907
8908  if (Cond.getOpcode() == ISD::SETCC) {
8909    SDValue NewCond = LowerSETCC(Cond, DAG);
8910    if (NewCond.getNode())
8911      Cond = NewCond;
8912  }
8913
8914  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
8915  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
8916  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
8917  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
8918  if (Cond.getOpcode() == X86ISD::SETCC &&
8919      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
8920      isZero(Cond.getOperand(1).getOperand(1))) {
8921    SDValue Cmp = Cond.getOperand(1);
8922
8923    unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
8924
8925    if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
8926        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
8927      SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
8928
8929      SDValue CmpOp0 = Cmp.getOperand(0);
8930      // Apply further optimizations for special cases
8931      // (select (x != 0), -1, 0) -> neg & sbb
8932      // (select (x == 0), 0, -1) -> neg & sbb
8933      if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
8934        if (YC->isNullValue() &&
8935            (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
8936          SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
8937          SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
8938                                    DAG.getConstant(0, CmpOp0.getValueType()),
8939                                    CmpOp0);
8940          SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
8941                                    DAG.getConstant(X86::COND_B, MVT::i8),
8942                                    SDValue(Neg.getNode(), 1));
8943          return Res;
8944        }
8945
8946      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
8947                        CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
8948      Cmp = ConvertCmpIfNecessary(Cmp, DAG);
8949
8950      SDValue Res =   // Res = 0 or -1.
8951        DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
8952                    DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
8953
8954      if (isAllOnes(Op1) != (CondCode == X86::COND_E))
8955        Res = DAG.getNOT(DL, Res, Res.getValueType());
8956
8957      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
8958      if (N2C == 0 || !N2C->isNullValue())
8959        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
8960      return Res;
8961    }
8962  }
8963
8964  // Look past (and (setcc_carry (cmp ...)), 1).
8965  if (Cond.getOpcode() == ISD::AND &&
8966      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
8967    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
8968    if (C && C->getAPIntValue() == 1)
8969      Cond = Cond.getOperand(0);
8970  }
8971
8972  // If condition flag is set by a X86ISD::CMP, then use it as the condition
8973  // setting operand in place of the X86ISD::SETCC.
8974  unsigned CondOpcode = Cond.getOpcode();
8975  if (CondOpcode == X86ISD::SETCC ||
8976      CondOpcode == X86ISD::SETCC_CARRY) {
8977    CC = Cond.getOperand(0);
8978
8979    SDValue Cmp = Cond.getOperand(1);
8980    unsigned Opc = Cmp.getOpcode();
8981    EVT VT = Op.getValueType();
8982
8983    bool IllegalFPCMov = false;
8984    if (VT.isFloatingPoint() && !VT.isVector() &&
8985        !isScalarFPTypeInSSEReg(VT))  // FPStack?
8986      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
8987
8988    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
8989        Opc == X86ISD::BT) { // FIXME
8990      Cond = Cmp;
8991      addTest = false;
8992    }
8993  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
8994             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
8995             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
8996              Cond.getOperand(0).getValueType() != MVT::i8)) {
8997    SDValue LHS = Cond.getOperand(0);
8998    SDValue RHS = Cond.getOperand(1);
8999    unsigned X86Opcode;
9000    unsigned X86Cond;
9001    SDVTList VTs;
9002    switch (CondOpcode) {
9003    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
9004    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
9005    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
9006    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
9007    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
9008    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
9009    default: llvm_unreachable("unexpected overflowing operator");
9010    }
9011    if (CondOpcode == ISD::UMULO)
9012      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
9013                          MVT::i32);
9014    else
9015      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
9016
9017    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
9018
9019    if (CondOpcode == ISD::UMULO)
9020      Cond = X86Op.getValue(2);
9021    else
9022      Cond = X86Op.getValue(1);
9023
9024    CC = DAG.getConstant(X86Cond, MVT::i8);
9025    addTest = false;
9026  }
9027
9028  if (addTest) {
9029    // Look pass the truncate if the high bits are known zero.
9030    if (isTruncWithZeroHighBitsInput(Cond, DAG))
9031        Cond = Cond.getOperand(0);
9032
9033    // We know the result of AND is compared against zero. Try to match
9034    // it to BT.
9035    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
9036      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
9037      if (NewSetCC.getNode()) {
9038        CC = NewSetCC.getOperand(0);
9039        Cond = NewSetCC.getOperand(1);
9040        addTest = false;
9041      }
9042    }
9043  }
9044
9045  if (addTest) {
9046    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9047    Cond = EmitTest(Cond, X86::COND_NE, DAG);
9048  }
9049
9050  // a <  b ? -1 :  0 -> RES = ~setcc_carry
9051  // a <  b ?  0 : -1 -> RES = setcc_carry
9052  // a >= b ? -1 :  0 -> RES = setcc_carry
9053  // a >= b ?  0 : -1 -> RES = ~setcc_carry
9054  if (Cond.getOpcode() == X86ISD::SUB) {
9055    Cond = ConvertCmpIfNecessary(Cond, DAG);
9056    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
9057
9058    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
9059        (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
9060      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
9061                                DAG.getConstant(X86::COND_B, MVT::i8), Cond);
9062      if (isAllOnes(Op1) != (CondCode == X86::COND_B))
9063        return DAG.getNOT(DL, Res, Res.getValueType());
9064      return Res;
9065    }
9066  }
9067
9068  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
9069  // condition is true.
9070  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
9071  SDValue Ops[] = { Op2, Op1, CC, Cond };
9072  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
9073}
9074
9075// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
9076// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
9077// from the AND / OR.
9078static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
9079  Opc = Op.getOpcode();
9080  if (Opc != ISD::OR && Opc != ISD::AND)
9081    return false;
9082  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
9083          Op.getOperand(0).hasOneUse() &&
9084          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
9085          Op.getOperand(1).hasOneUse());
9086}
9087
9088// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
9089// 1 and that the SETCC node has a single use.
9090static bool isXor1OfSetCC(SDValue Op) {
9091  if (Op.getOpcode() != ISD::XOR)
9092    return false;
9093  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
9094  if (N1C && N1C->getAPIntValue() == 1) {
9095    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
9096      Op.getOperand(0).hasOneUse();
9097  }
9098  return false;
9099}
9100
9101SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
9102  bool addTest = true;
9103  SDValue Chain = Op.getOperand(0);
9104  SDValue Cond  = Op.getOperand(1);
9105  SDValue Dest  = Op.getOperand(2);
9106  DebugLoc dl = Op.getDebugLoc();
9107  SDValue CC;
9108  bool Inverted = false;
9109
9110  if (Cond.getOpcode() == ISD::SETCC) {
9111    // Check for setcc([su]{add,sub,mul}o == 0).
9112    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
9113        isa<ConstantSDNode>(Cond.getOperand(1)) &&
9114        cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
9115        Cond.getOperand(0).getResNo() == 1 &&
9116        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
9117         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
9118         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
9119         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
9120         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
9121         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
9122      Inverted = true;
9123      Cond = Cond.getOperand(0);
9124    } else {
9125      SDValue NewCond = LowerSETCC(Cond, DAG);
9126      if (NewCond.getNode())
9127        Cond = NewCond;
9128    }
9129  }
9130#if 0
9131  // FIXME: LowerXALUO doesn't handle these!!
9132  else if (Cond.getOpcode() == X86ISD::ADD  ||
9133           Cond.getOpcode() == X86ISD::SUB  ||
9134           Cond.getOpcode() == X86ISD::SMUL ||
9135           Cond.getOpcode() == X86ISD::UMUL)
9136    Cond = LowerXALUO(Cond, DAG);
9137#endif
9138
9139  // Look pass (and (setcc_carry (cmp ...)), 1).
9140  if (Cond.getOpcode() == ISD::AND &&
9141      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
9142    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
9143    if (C && C->getAPIntValue() == 1)
9144      Cond = Cond.getOperand(0);
9145  }
9146
9147  // If condition flag is set by a X86ISD::CMP, then use it as the condition
9148  // setting operand in place of the X86ISD::SETCC.
9149  unsigned CondOpcode = Cond.getOpcode();
9150  if (CondOpcode == X86ISD::SETCC ||
9151      CondOpcode == X86ISD::SETCC_CARRY) {
9152    CC = Cond.getOperand(0);
9153
9154    SDValue Cmp = Cond.getOperand(1);
9155    unsigned Opc = Cmp.getOpcode();
9156    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
9157    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
9158      Cond = Cmp;
9159      addTest = false;
9160    } else {
9161      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
9162      default: break;
9163      case X86::COND_O:
9164      case X86::COND_B:
9165        // These can only come from an arithmetic instruction with overflow,
9166        // e.g. SADDO, UADDO.
9167        Cond = Cond.getNode()->getOperand(1);
9168        addTest = false;
9169        break;
9170      }
9171    }
9172  }
9173  CondOpcode = Cond.getOpcode();
9174  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
9175      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
9176      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
9177       Cond.getOperand(0).getValueType() != MVT::i8)) {
9178    SDValue LHS = Cond.getOperand(0);
9179    SDValue RHS = Cond.getOperand(1);
9180    unsigned X86Opcode;
9181    unsigned X86Cond;
9182    SDVTList VTs;
9183    switch (CondOpcode) {
9184    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
9185    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
9186    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
9187    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
9188    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
9189    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
9190    default: llvm_unreachable("unexpected overflowing operator");
9191    }
9192    if (Inverted)
9193      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
9194    if (CondOpcode == ISD::UMULO)
9195      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
9196                          MVT::i32);
9197    else
9198      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
9199
9200    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
9201
9202    if (CondOpcode == ISD::UMULO)
9203      Cond = X86Op.getValue(2);
9204    else
9205      Cond = X86Op.getValue(1);
9206
9207    CC = DAG.getConstant(X86Cond, MVT::i8);
9208    addTest = false;
9209  } else {
9210    unsigned CondOpc;
9211    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
9212      SDValue Cmp = Cond.getOperand(0).getOperand(1);
9213      if (CondOpc == ISD::OR) {
9214        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
9215        // two branches instead of an explicit OR instruction with a
9216        // separate test.
9217        if (Cmp == Cond.getOperand(1).getOperand(1) &&
9218            isX86LogicalCmp(Cmp)) {
9219          CC = Cond.getOperand(0).getOperand(0);
9220          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9221                              Chain, Dest, CC, Cmp);
9222          CC = Cond.getOperand(1).getOperand(0);
9223          Cond = Cmp;
9224          addTest = false;
9225        }
9226      } else { // ISD::AND
9227        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
9228        // two branches instead of an explicit AND instruction with a
9229        // separate test. However, we only do this if this block doesn't
9230        // have a fall-through edge, because this requires an explicit
9231        // jmp when the condition is false.
9232        if (Cmp == Cond.getOperand(1).getOperand(1) &&
9233            isX86LogicalCmp(Cmp) &&
9234            Op.getNode()->hasOneUse()) {
9235          X86::CondCode CCode =
9236            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
9237          CCode = X86::GetOppositeBranchCondition(CCode);
9238          CC = DAG.getConstant(CCode, MVT::i8);
9239          SDNode *User = *Op.getNode()->use_begin();
9240          // Look for an unconditional branch following this conditional branch.
9241          // We need this because we need to reverse the successors in order
9242          // to implement FCMP_OEQ.
9243          if (User->getOpcode() == ISD::BR) {
9244            SDValue FalseBB = User->getOperand(1);
9245            SDNode *NewBR =
9246              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
9247            assert(NewBR == User);
9248            (void)NewBR;
9249            Dest = FalseBB;
9250
9251            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9252                                Chain, Dest, CC, Cmp);
9253            X86::CondCode CCode =
9254              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
9255            CCode = X86::GetOppositeBranchCondition(CCode);
9256            CC = DAG.getConstant(CCode, MVT::i8);
9257            Cond = Cmp;
9258            addTest = false;
9259          }
9260        }
9261      }
9262    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
9263      // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
9264      // It should be transformed during dag combiner except when the condition
9265      // is set by a arithmetics with overflow node.
9266      X86::CondCode CCode =
9267        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
9268      CCode = X86::GetOppositeBranchCondition(CCode);
9269      CC = DAG.getConstant(CCode, MVT::i8);
9270      Cond = Cond.getOperand(0).getOperand(1);
9271      addTest = false;
9272    } else if (Cond.getOpcode() == ISD::SETCC &&
9273               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
9274      // For FCMP_OEQ, we can emit
9275      // two branches instead of an explicit AND instruction with a
9276      // separate test. However, we only do this if this block doesn't
9277      // have a fall-through edge, because this requires an explicit
9278      // jmp when the condition is false.
9279      if (Op.getNode()->hasOneUse()) {
9280        SDNode *User = *Op.getNode()->use_begin();
9281        // Look for an unconditional branch following this conditional branch.
9282        // We need this because we need to reverse the successors in order
9283        // to implement FCMP_OEQ.
9284        if (User->getOpcode() == ISD::BR) {
9285          SDValue FalseBB = User->getOperand(1);
9286          SDNode *NewBR =
9287            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
9288          assert(NewBR == User);
9289          (void)NewBR;
9290          Dest = FalseBB;
9291
9292          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
9293                                    Cond.getOperand(0), Cond.getOperand(1));
9294          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
9295          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9296          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9297                              Chain, Dest, CC, Cmp);
9298          CC = DAG.getConstant(X86::COND_P, MVT::i8);
9299          Cond = Cmp;
9300          addTest = false;
9301        }
9302      }
9303    } else if (Cond.getOpcode() == ISD::SETCC &&
9304               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
9305      // For FCMP_UNE, we can emit
9306      // two branches instead of an explicit AND instruction with a
9307      // separate test. However, we only do this if this block doesn't
9308      // have a fall-through edge, because this requires an explicit
9309      // jmp when the condition is false.
9310      if (Op.getNode()->hasOneUse()) {
9311        SDNode *User = *Op.getNode()->use_begin();
9312        // Look for an unconditional branch following this conditional branch.
9313        // We need this because we need to reverse the successors in order
9314        // to implement FCMP_UNE.
9315        if (User->getOpcode() == ISD::BR) {
9316          SDValue FalseBB = User->getOperand(1);
9317          SDNode *NewBR =
9318            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
9319          assert(NewBR == User);
9320          (void)NewBR;
9321
9322          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
9323                                    Cond.getOperand(0), Cond.getOperand(1));
9324          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
9325          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9326          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9327                              Chain, Dest, CC, Cmp);
9328          CC = DAG.getConstant(X86::COND_NP, MVT::i8);
9329          Cond = Cmp;
9330          addTest = false;
9331          Dest = FalseBB;
9332        }
9333      }
9334    }
9335  }
9336
9337  if (addTest) {
9338    // Look pass the truncate if the high bits are known zero.
9339    if (isTruncWithZeroHighBitsInput(Cond, DAG))
9340        Cond = Cond.getOperand(0);
9341
9342    // We know the result of AND is compared against zero. Try to match
9343    // it to BT.
9344    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
9345      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
9346      if (NewSetCC.getNode()) {
9347        CC = NewSetCC.getOperand(0);
9348        Cond = NewSetCC.getOperand(1);
9349        addTest = false;
9350      }
9351    }
9352  }
9353
9354  if (addTest) {
9355    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
9356    Cond = EmitTest(Cond, X86::COND_NE, DAG);
9357  }
9358  Cond = ConvertCmpIfNecessary(Cond, DAG);
9359  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
9360                     Chain, Dest, CC, Cond);
9361}
9362
9363
9364// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
9365// Calls to _alloca is needed to probe the stack when allocating more than 4k
9366// bytes in one go. Touching the stack at 4K increments is necessary to ensure
9367// that the guard pages used by the OS virtual memory manager are allocated in
9368// correct sequence.
9369SDValue
9370X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
9371                                           SelectionDAG &DAG) const {
9372  assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
9373          getTargetMachine().Options.EnableSegmentedStacks) &&
9374         "This should be used only on Windows targets or when segmented stacks "
9375         "are being used");
9376  assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
9377  DebugLoc dl = Op.getDebugLoc();
9378
9379  // Get the inputs.
9380  SDValue Chain = Op.getOperand(0);
9381  SDValue Size  = Op.getOperand(1);
9382  // FIXME: Ensure alignment here
9383
9384  bool Is64Bit = Subtarget->is64Bit();
9385  EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
9386
9387  if (getTargetMachine().Options.EnableSegmentedStacks) {
9388    MachineFunction &MF = DAG.getMachineFunction();
9389    MachineRegisterInfo &MRI = MF.getRegInfo();
9390
9391    if (Is64Bit) {
9392      // The 64 bit implementation of segmented stacks needs to clobber both r10
9393      // r11. This makes it impossible to use it along with nested parameters.
9394      const Function *F = MF.getFunction();
9395
9396      for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
9397           I != E; ++I)
9398        if (I->hasNestAttr())
9399          report_fatal_error("Cannot use segmented stacks with functions that "
9400                             "have nested arguments.");
9401    }
9402
9403    const TargetRegisterClass *AddrRegClass =
9404      getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
9405    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
9406    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
9407    SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
9408                                DAG.getRegister(Vreg, SPTy));
9409    SDValue Ops1[2] = { Value, Chain };
9410    return DAG.getMergeValues(Ops1, 2, dl);
9411  } else {
9412    SDValue Flag;
9413    unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
9414
9415    Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
9416    Flag = Chain.getValue(1);
9417    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
9418
9419    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
9420    Flag = Chain.getValue(1);
9421
9422    Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
9423
9424    SDValue Ops1[2] = { Chain.getValue(0), Chain };
9425    return DAG.getMergeValues(Ops1, 2, dl);
9426  }
9427}
9428
9429SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
9430  MachineFunction &MF = DAG.getMachineFunction();
9431  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
9432
9433  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9434  DebugLoc DL = Op.getDebugLoc();
9435
9436  if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
9437    // vastart just stores the address of the VarArgsFrameIndex slot into the
9438    // memory location argument.
9439    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
9440                                   getPointerTy());
9441    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
9442                        MachinePointerInfo(SV), false, false, 0);
9443  }
9444
9445  // __va_list_tag:
9446  //   gp_offset         (0 - 6 * 8)
9447  //   fp_offset         (48 - 48 + 8 * 16)
9448  //   overflow_arg_area (point to parameters coming in memory).
9449  //   reg_save_area
9450  SmallVector<SDValue, 8> MemOps;
9451  SDValue FIN = Op.getOperand(1);
9452  // Store gp_offset
9453  SDValue Store = DAG.getStore(Op.getOperand(0), DL,
9454                               DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
9455                                               MVT::i32),
9456                               FIN, MachinePointerInfo(SV), false, false, 0);
9457  MemOps.push_back(Store);
9458
9459  // Store fp_offset
9460  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
9461                    FIN, DAG.getIntPtrConstant(4));
9462  Store = DAG.getStore(Op.getOperand(0), DL,
9463                       DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
9464                                       MVT::i32),
9465                       FIN, MachinePointerInfo(SV, 4), false, false, 0);
9466  MemOps.push_back(Store);
9467
9468  // Store ptr to overflow_arg_area
9469  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
9470                    FIN, DAG.getIntPtrConstant(4));
9471  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
9472                                    getPointerTy());
9473  Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
9474                       MachinePointerInfo(SV, 8),
9475                       false, false, 0);
9476  MemOps.push_back(Store);
9477
9478  // Store ptr to reg_save_area.
9479  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
9480                    FIN, DAG.getIntPtrConstant(8));
9481  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
9482                                    getPointerTy());
9483  Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
9484                       MachinePointerInfo(SV, 16), false, false, 0);
9485  MemOps.push_back(Store);
9486  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
9487                     &MemOps[0], MemOps.size());
9488}
9489
9490SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
9491  assert(Subtarget->is64Bit() &&
9492         "LowerVAARG only handles 64-bit va_arg!");
9493  assert((Subtarget->isTargetLinux() ||
9494          Subtarget->isTargetDarwin()) &&
9495          "Unhandled target in LowerVAARG");
9496  assert(Op.getNode()->getNumOperands() == 4);
9497  SDValue Chain = Op.getOperand(0);
9498  SDValue SrcPtr = Op.getOperand(1);
9499  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9500  unsigned Align = Op.getConstantOperandVal(3);
9501  DebugLoc dl = Op.getDebugLoc();
9502
9503  EVT ArgVT = Op.getNode()->getValueType(0);
9504  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
9505  uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy);
9506  uint8_t ArgMode;
9507
9508  // Decide which area this value should be read from.
9509  // TODO: Implement the AMD64 ABI in its entirety. This simple
9510  // selection mechanism works only for the basic types.
9511  if (ArgVT == MVT::f80) {
9512    llvm_unreachable("va_arg for f80 not yet implemented");
9513  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
9514    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
9515  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
9516    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
9517  } else {
9518    llvm_unreachable("Unhandled argument type in LowerVAARG");
9519  }
9520
9521  if (ArgMode == 2) {
9522    // Sanity Check: Make sure using fp_offset makes sense.
9523    assert(!getTargetMachine().Options.UseSoftFloat &&
9524           !(DAG.getMachineFunction()
9525                .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) &&
9526           Subtarget->hasSSE1());
9527  }
9528
9529  // Insert VAARG_64 node into the DAG
9530  // VAARG_64 returns two values: Variable Argument Address, Chain
9531  SmallVector<SDValue, 11> InstOps;
9532  InstOps.push_back(Chain);
9533  InstOps.push_back(SrcPtr);
9534  InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
9535  InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
9536  InstOps.push_back(DAG.getConstant(Align, MVT::i32));
9537  SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
9538  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
9539                                          VTs, &InstOps[0], InstOps.size(),
9540                                          MVT::i64,
9541                                          MachinePointerInfo(SV),
9542                                          /*Align=*/0,
9543                                          /*Volatile=*/false,
9544                                          /*ReadMem=*/true,
9545                                          /*WriteMem=*/true);
9546  Chain = VAARG.getValue(1);
9547
9548  // Load the next argument and return it
9549  return DAG.getLoad(ArgVT, dl,
9550                     Chain,
9551                     VAARG,
9552                     MachinePointerInfo(),
9553                     false, false, false, 0);
9554}
9555
9556SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
9557  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
9558  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
9559  SDValue Chain = Op.getOperand(0);
9560  SDValue DstPtr = Op.getOperand(1);
9561  SDValue SrcPtr = Op.getOperand(2);
9562  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
9563  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
9564  DebugLoc DL = Op.getDebugLoc();
9565
9566  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
9567                       DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
9568                       false,
9569                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
9570}
9571
9572// getTargetVShiftNOde - Handle vector element shifts where the shift amount
9573// may or may not be a constant. Takes immediate version of shift as input.
9574static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
9575                                   SDValue SrcOp, SDValue ShAmt,
9576                                   SelectionDAG &DAG) {
9577  assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
9578
9579  if (isa<ConstantSDNode>(ShAmt)) {
9580    // Constant may be a TargetConstant. Use a regular constant.
9581    uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue();
9582    switch (Opc) {
9583      default: llvm_unreachable("Unknown target vector shift node");
9584      case X86ISD::VSHLI:
9585      case X86ISD::VSRLI:
9586      case X86ISD::VSRAI:
9587        return DAG.getNode(Opc, dl, VT, SrcOp,
9588                           DAG.getConstant(ShiftAmt, MVT::i32));
9589    }
9590  }
9591
9592  // Change opcode to non-immediate version
9593  switch (Opc) {
9594    default: llvm_unreachable("Unknown target vector shift node");
9595    case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
9596    case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
9597    case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
9598  }
9599
9600  // Need to build a vector containing shift amount
9601  // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0
9602  SDValue ShOps[4];
9603  ShOps[0] = ShAmt;
9604  ShOps[1] = DAG.getConstant(0, MVT::i32);
9605  ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
9606  ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
9607
9608  // The return type has to be a 128-bit type with the same element
9609  // type as the input type.
9610  MVT EltVT = VT.getVectorElementType().getSimpleVT();
9611  EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
9612
9613  ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
9614  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
9615}
9616
9617SDValue
9618X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
9619  DebugLoc dl = Op.getDebugLoc();
9620  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9621  switch (IntNo) {
9622  default: return SDValue();    // Don't custom lower most intrinsics.
9623  // Comparison intrinsics.
9624  case Intrinsic::x86_sse_comieq_ss:
9625  case Intrinsic::x86_sse_comilt_ss:
9626  case Intrinsic::x86_sse_comile_ss:
9627  case Intrinsic::x86_sse_comigt_ss:
9628  case Intrinsic::x86_sse_comige_ss:
9629  case Intrinsic::x86_sse_comineq_ss:
9630  case Intrinsic::x86_sse_ucomieq_ss:
9631  case Intrinsic::x86_sse_ucomilt_ss:
9632  case Intrinsic::x86_sse_ucomile_ss:
9633  case Intrinsic::x86_sse_ucomigt_ss:
9634  case Intrinsic::x86_sse_ucomige_ss:
9635  case Intrinsic::x86_sse_ucomineq_ss:
9636  case Intrinsic::x86_sse2_comieq_sd:
9637  case Intrinsic::x86_sse2_comilt_sd:
9638  case Intrinsic::x86_sse2_comile_sd:
9639  case Intrinsic::x86_sse2_comigt_sd:
9640  case Intrinsic::x86_sse2_comige_sd:
9641  case Intrinsic::x86_sse2_comineq_sd:
9642  case Intrinsic::x86_sse2_ucomieq_sd:
9643  case Intrinsic::x86_sse2_ucomilt_sd:
9644  case Intrinsic::x86_sse2_ucomile_sd:
9645  case Intrinsic::x86_sse2_ucomigt_sd:
9646  case Intrinsic::x86_sse2_ucomige_sd:
9647  case Intrinsic::x86_sse2_ucomineq_sd: {
9648    unsigned Opc;
9649    ISD::CondCode CC;
9650    switch (IntNo) {
9651    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
9652    case Intrinsic::x86_sse_comieq_ss:
9653    case Intrinsic::x86_sse2_comieq_sd:
9654      Opc = X86ISD::COMI;
9655      CC = ISD::SETEQ;
9656      break;
9657    case Intrinsic::x86_sse_comilt_ss:
9658    case Intrinsic::x86_sse2_comilt_sd:
9659      Opc = X86ISD::COMI;
9660      CC = ISD::SETLT;
9661      break;
9662    case Intrinsic::x86_sse_comile_ss:
9663    case Intrinsic::x86_sse2_comile_sd:
9664      Opc = X86ISD::COMI;
9665      CC = ISD::SETLE;
9666      break;
9667    case Intrinsic::x86_sse_comigt_ss:
9668    case Intrinsic::x86_sse2_comigt_sd:
9669      Opc = X86ISD::COMI;
9670      CC = ISD::SETGT;
9671      break;
9672    case Intrinsic::x86_sse_comige_ss:
9673    case Intrinsic::x86_sse2_comige_sd:
9674      Opc = X86ISD::COMI;
9675      CC = ISD::SETGE;
9676      break;
9677    case Intrinsic::x86_sse_comineq_ss:
9678    case Intrinsic::x86_sse2_comineq_sd:
9679      Opc = X86ISD::COMI;
9680      CC = ISD::SETNE;
9681      break;
9682    case Intrinsic::x86_sse_ucomieq_ss:
9683    case Intrinsic::x86_sse2_ucomieq_sd:
9684      Opc = X86ISD::UCOMI;
9685      CC = ISD::SETEQ;
9686      break;
9687    case Intrinsic::x86_sse_ucomilt_ss:
9688    case Intrinsic::x86_sse2_ucomilt_sd:
9689      Opc = X86ISD::UCOMI;
9690      CC = ISD::SETLT;
9691      break;
9692    case Intrinsic::x86_sse_ucomile_ss:
9693    case Intrinsic::x86_sse2_ucomile_sd:
9694      Opc = X86ISD::UCOMI;
9695      CC = ISD::SETLE;
9696      break;
9697    case Intrinsic::x86_sse_ucomigt_ss:
9698    case Intrinsic::x86_sse2_ucomigt_sd:
9699      Opc = X86ISD::UCOMI;
9700      CC = ISD::SETGT;
9701      break;
9702    case Intrinsic::x86_sse_ucomige_ss:
9703    case Intrinsic::x86_sse2_ucomige_sd:
9704      Opc = X86ISD::UCOMI;
9705      CC = ISD::SETGE;
9706      break;
9707    case Intrinsic::x86_sse_ucomineq_ss:
9708    case Intrinsic::x86_sse2_ucomineq_sd:
9709      Opc = X86ISD::UCOMI;
9710      CC = ISD::SETNE;
9711      break;
9712    }
9713
9714    SDValue LHS = Op.getOperand(1);
9715    SDValue RHS = Op.getOperand(2);
9716    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
9717    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
9718    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
9719    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
9720                                DAG.getConstant(X86CC, MVT::i8), Cond);
9721    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
9722  }
9723
9724  // Arithmetic intrinsics.
9725  case Intrinsic::x86_sse2_pmulu_dq:
9726  case Intrinsic::x86_avx2_pmulu_dq:
9727    return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
9728                       Op.getOperand(1), Op.getOperand(2));
9729
9730  // SSE3/AVX horizontal add/sub intrinsics
9731  case Intrinsic::x86_sse3_hadd_ps:
9732  case Intrinsic::x86_sse3_hadd_pd:
9733  case Intrinsic::x86_avx_hadd_ps_256:
9734  case Intrinsic::x86_avx_hadd_pd_256:
9735  case Intrinsic::x86_sse3_hsub_ps:
9736  case Intrinsic::x86_sse3_hsub_pd:
9737  case Intrinsic::x86_avx_hsub_ps_256:
9738  case Intrinsic::x86_avx_hsub_pd_256:
9739  case Intrinsic::x86_ssse3_phadd_w_128:
9740  case Intrinsic::x86_ssse3_phadd_d_128:
9741  case Intrinsic::x86_avx2_phadd_w:
9742  case Intrinsic::x86_avx2_phadd_d:
9743  case Intrinsic::x86_ssse3_phsub_w_128:
9744  case Intrinsic::x86_ssse3_phsub_d_128:
9745  case Intrinsic::x86_avx2_phsub_w:
9746  case Intrinsic::x86_avx2_phsub_d: {
9747    unsigned Opcode;
9748    switch (IntNo) {
9749    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
9750    case Intrinsic::x86_sse3_hadd_ps:
9751    case Intrinsic::x86_sse3_hadd_pd:
9752    case Intrinsic::x86_avx_hadd_ps_256:
9753    case Intrinsic::x86_avx_hadd_pd_256:
9754      Opcode = X86ISD::FHADD;
9755      break;
9756    case Intrinsic::x86_sse3_hsub_ps:
9757    case Intrinsic::x86_sse3_hsub_pd:
9758    case Intrinsic::x86_avx_hsub_ps_256:
9759    case Intrinsic::x86_avx_hsub_pd_256:
9760      Opcode = X86ISD::FHSUB;
9761      break;
9762    case Intrinsic::x86_ssse3_phadd_w_128:
9763    case Intrinsic::x86_ssse3_phadd_d_128:
9764    case Intrinsic::x86_avx2_phadd_w:
9765    case Intrinsic::x86_avx2_phadd_d:
9766      Opcode = X86ISD::HADD;
9767      break;
9768    case Intrinsic::x86_ssse3_phsub_w_128:
9769    case Intrinsic::x86_ssse3_phsub_d_128:
9770    case Intrinsic::x86_avx2_phsub_w:
9771    case Intrinsic::x86_avx2_phsub_d:
9772      Opcode = X86ISD::HSUB;
9773      break;
9774    }
9775    return DAG.getNode(Opcode, dl, Op.getValueType(),
9776                       Op.getOperand(1), Op.getOperand(2));
9777  }
9778
9779  // AVX2 variable shift intrinsics
9780  case Intrinsic::x86_avx2_psllv_d:
9781  case Intrinsic::x86_avx2_psllv_q:
9782  case Intrinsic::x86_avx2_psllv_d_256:
9783  case Intrinsic::x86_avx2_psllv_q_256:
9784  case Intrinsic::x86_avx2_psrlv_d:
9785  case Intrinsic::x86_avx2_psrlv_q:
9786  case Intrinsic::x86_avx2_psrlv_d_256:
9787  case Intrinsic::x86_avx2_psrlv_q_256:
9788  case Intrinsic::x86_avx2_psrav_d:
9789  case Intrinsic::x86_avx2_psrav_d_256: {
9790    unsigned Opcode;
9791    switch (IntNo) {
9792    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
9793    case Intrinsic::x86_avx2_psllv_d:
9794    case Intrinsic::x86_avx2_psllv_q:
9795    case Intrinsic::x86_avx2_psllv_d_256:
9796    case Intrinsic::x86_avx2_psllv_q_256:
9797      Opcode = ISD::SHL;
9798      break;
9799    case Intrinsic::x86_avx2_psrlv_d:
9800    case Intrinsic::x86_avx2_psrlv_q:
9801    case Intrinsic::x86_avx2_psrlv_d_256:
9802    case Intrinsic::x86_avx2_psrlv_q_256:
9803      Opcode = ISD::SRL;
9804      break;
9805    case Intrinsic::x86_avx2_psrav_d:
9806    case Intrinsic::x86_avx2_psrav_d_256:
9807      Opcode = ISD::SRA;
9808      break;
9809    }
9810    return DAG.getNode(Opcode, dl, Op.getValueType(),
9811                       Op.getOperand(1), Op.getOperand(2));
9812  }
9813
9814  case Intrinsic::x86_ssse3_pshuf_b_128:
9815  case Intrinsic::x86_avx2_pshuf_b:
9816    return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
9817                       Op.getOperand(1), Op.getOperand(2));
9818
9819  case Intrinsic::x86_ssse3_psign_b_128:
9820  case Intrinsic::x86_ssse3_psign_w_128:
9821  case Intrinsic::x86_ssse3_psign_d_128:
9822  case Intrinsic::x86_avx2_psign_b:
9823  case Intrinsic::x86_avx2_psign_w:
9824  case Intrinsic::x86_avx2_psign_d:
9825    return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
9826                       Op.getOperand(1), Op.getOperand(2));
9827
9828  case Intrinsic::x86_sse41_insertps:
9829    return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
9830                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
9831
9832  case Intrinsic::x86_avx_vperm2f128_ps_256:
9833  case Intrinsic::x86_avx_vperm2f128_pd_256:
9834  case Intrinsic::x86_avx_vperm2f128_si_256:
9835  case Intrinsic::x86_avx2_vperm2i128:
9836    return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
9837                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
9838
9839  case Intrinsic::x86_avx2_permd:
9840  case Intrinsic::x86_avx2_permps:
9841    // Operands intentionally swapped. Mask is last operand to intrinsic,
9842    // but second operand for node/intruction.
9843    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
9844                       Op.getOperand(2), Op.getOperand(1));
9845
9846  // ptest and testp intrinsics. The intrinsic these come from are designed to
9847  // return an integer value, not just an instruction so lower it to the ptest
9848  // or testp pattern and a setcc for the result.
9849  case Intrinsic::x86_sse41_ptestz:
9850  case Intrinsic::x86_sse41_ptestc:
9851  case Intrinsic::x86_sse41_ptestnzc:
9852  case Intrinsic::x86_avx_ptestz_256:
9853  case Intrinsic::x86_avx_ptestc_256:
9854  case Intrinsic::x86_avx_ptestnzc_256:
9855  case Intrinsic::x86_avx_vtestz_ps:
9856  case Intrinsic::x86_avx_vtestc_ps:
9857  case Intrinsic::x86_avx_vtestnzc_ps:
9858  case Intrinsic::x86_avx_vtestz_pd:
9859  case Intrinsic::x86_avx_vtestc_pd:
9860  case Intrinsic::x86_avx_vtestnzc_pd:
9861  case Intrinsic::x86_avx_vtestz_ps_256:
9862  case Intrinsic::x86_avx_vtestc_ps_256:
9863  case Intrinsic::x86_avx_vtestnzc_ps_256:
9864  case Intrinsic::x86_avx_vtestz_pd_256:
9865  case Intrinsic::x86_avx_vtestc_pd_256:
9866  case Intrinsic::x86_avx_vtestnzc_pd_256: {
9867    bool IsTestPacked = false;
9868    unsigned X86CC;
9869    switch (IntNo) {
9870    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
9871    case Intrinsic::x86_avx_vtestz_ps:
9872    case Intrinsic::x86_avx_vtestz_pd:
9873    case Intrinsic::x86_avx_vtestz_ps_256:
9874    case Intrinsic::x86_avx_vtestz_pd_256:
9875      IsTestPacked = true; // Fallthrough
9876    case Intrinsic::x86_sse41_ptestz:
9877    case Intrinsic::x86_avx_ptestz_256:
9878      // ZF = 1
9879      X86CC = X86::COND_E;
9880      break;
9881    case Intrinsic::x86_avx_vtestc_ps:
9882    case Intrinsic::x86_avx_vtestc_pd:
9883    case Intrinsic::x86_avx_vtestc_ps_256:
9884    case Intrinsic::x86_avx_vtestc_pd_256:
9885      IsTestPacked = true; // Fallthrough
9886    case Intrinsic::x86_sse41_ptestc:
9887    case Intrinsic::x86_avx_ptestc_256:
9888      // CF = 1
9889      X86CC = X86::COND_B;
9890      break;
9891    case Intrinsic::x86_avx_vtestnzc_ps:
9892    case Intrinsic::x86_avx_vtestnzc_pd:
9893    case Intrinsic::x86_avx_vtestnzc_ps_256:
9894    case Intrinsic::x86_avx_vtestnzc_pd_256:
9895      IsTestPacked = true; // Fallthrough
9896    case Intrinsic::x86_sse41_ptestnzc:
9897    case Intrinsic::x86_avx_ptestnzc_256:
9898      // ZF and CF = 0
9899      X86CC = X86::COND_A;
9900      break;
9901    }
9902
9903    SDValue LHS = Op.getOperand(1);
9904    SDValue RHS = Op.getOperand(2);
9905    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
9906    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
9907    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
9908    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
9909    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
9910  }
9911
9912  // SSE/AVX shift intrinsics
9913  case Intrinsic::x86_sse2_psll_w:
9914  case Intrinsic::x86_sse2_psll_d:
9915  case Intrinsic::x86_sse2_psll_q:
9916  case Intrinsic::x86_avx2_psll_w:
9917  case Intrinsic::x86_avx2_psll_d:
9918  case Intrinsic::x86_avx2_psll_q:
9919  case Intrinsic::x86_sse2_psrl_w:
9920  case Intrinsic::x86_sse2_psrl_d:
9921  case Intrinsic::x86_sse2_psrl_q:
9922  case Intrinsic::x86_avx2_psrl_w:
9923  case Intrinsic::x86_avx2_psrl_d:
9924  case Intrinsic::x86_avx2_psrl_q:
9925  case Intrinsic::x86_sse2_psra_w:
9926  case Intrinsic::x86_sse2_psra_d:
9927  case Intrinsic::x86_avx2_psra_w:
9928  case Intrinsic::x86_avx2_psra_d: {
9929    unsigned Opcode;
9930    switch (IntNo) {
9931    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
9932    case Intrinsic::x86_sse2_psll_w:
9933    case Intrinsic::x86_sse2_psll_d:
9934    case Intrinsic::x86_sse2_psll_q:
9935    case Intrinsic::x86_avx2_psll_w:
9936    case Intrinsic::x86_avx2_psll_d:
9937    case Intrinsic::x86_avx2_psll_q:
9938      Opcode = X86ISD::VSHL;
9939      break;
9940    case Intrinsic::x86_sse2_psrl_w:
9941    case Intrinsic::x86_sse2_psrl_d:
9942    case Intrinsic::x86_sse2_psrl_q:
9943    case Intrinsic::x86_avx2_psrl_w:
9944    case Intrinsic::x86_avx2_psrl_d:
9945    case Intrinsic::x86_avx2_psrl_q:
9946      Opcode = X86ISD::VSRL;
9947      break;
9948    case Intrinsic::x86_sse2_psra_w:
9949    case Intrinsic::x86_sse2_psra_d:
9950    case Intrinsic::x86_avx2_psra_w:
9951    case Intrinsic::x86_avx2_psra_d:
9952      Opcode = X86ISD::VSRA;
9953      break;
9954    }
9955    return DAG.getNode(Opcode, dl, Op.getValueType(),
9956                       Op.getOperand(1), Op.getOperand(2));
9957  }
9958
9959  // SSE/AVX immediate shift intrinsics
9960  case Intrinsic::x86_sse2_pslli_w:
9961  case Intrinsic::x86_sse2_pslli_d:
9962  case Intrinsic::x86_sse2_pslli_q:
9963  case Intrinsic::x86_avx2_pslli_w:
9964  case Intrinsic::x86_avx2_pslli_d:
9965  case Intrinsic::x86_avx2_pslli_q:
9966  case Intrinsic::x86_sse2_psrli_w:
9967  case Intrinsic::x86_sse2_psrli_d:
9968  case Intrinsic::x86_sse2_psrli_q:
9969  case Intrinsic::x86_avx2_psrli_w:
9970  case Intrinsic::x86_avx2_psrli_d:
9971  case Intrinsic::x86_avx2_psrli_q:
9972  case Intrinsic::x86_sse2_psrai_w:
9973  case Intrinsic::x86_sse2_psrai_d:
9974  case Intrinsic::x86_avx2_psrai_w:
9975  case Intrinsic::x86_avx2_psrai_d: {
9976    unsigned Opcode;
9977    switch (IntNo) {
9978    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
9979    case Intrinsic::x86_sse2_pslli_w:
9980    case Intrinsic::x86_sse2_pslli_d:
9981    case Intrinsic::x86_sse2_pslli_q:
9982    case Intrinsic::x86_avx2_pslli_w:
9983    case Intrinsic::x86_avx2_pslli_d:
9984    case Intrinsic::x86_avx2_pslli_q:
9985      Opcode = X86ISD::VSHLI;
9986      break;
9987    case Intrinsic::x86_sse2_psrli_w:
9988    case Intrinsic::x86_sse2_psrli_d:
9989    case Intrinsic::x86_sse2_psrli_q:
9990    case Intrinsic::x86_avx2_psrli_w:
9991    case Intrinsic::x86_avx2_psrli_d:
9992    case Intrinsic::x86_avx2_psrli_q:
9993      Opcode = X86ISD::VSRLI;
9994      break;
9995    case Intrinsic::x86_sse2_psrai_w:
9996    case Intrinsic::x86_sse2_psrai_d:
9997    case Intrinsic::x86_avx2_psrai_w:
9998    case Intrinsic::x86_avx2_psrai_d:
9999      Opcode = X86ISD::VSRAI;
10000      break;
10001    }
10002    return getTargetVShiftNode(Opcode, dl, Op.getValueType(),
10003                               Op.getOperand(1), Op.getOperand(2), DAG);
10004  }
10005
10006  case Intrinsic::x86_sse42_pcmpistria128:
10007  case Intrinsic::x86_sse42_pcmpestria128:
10008  case Intrinsic::x86_sse42_pcmpistric128:
10009  case Intrinsic::x86_sse42_pcmpestric128:
10010  case Intrinsic::x86_sse42_pcmpistrio128:
10011  case Intrinsic::x86_sse42_pcmpestrio128:
10012  case Intrinsic::x86_sse42_pcmpistris128:
10013  case Intrinsic::x86_sse42_pcmpestris128:
10014  case Intrinsic::x86_sse42_pcmpistriz128:
10015  case Intrinsic::x86_sse42_pcmpestriz128: {
10016    unsigned Opcode;
10017    unsigned X86CC;
10018    switch (IntNo) {
10019    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10020    case Intrinsic::x86_sse42_pcmpistria128:
10021      Opcode = X86ISD::PCMPISTRI;
10022      X86CC = X86::COND_A;
10023      break;
10024    case Intrinsic::x86_sse42_pcmpestria128:
10025      Opcode = X86ISD::PCMPESTRI;
10026      X86CC = X86::COND_A;
10027      break;
10028    case Intrinsic::x86_sse42_pcmpistric128:
10029      Opcode = X86ISD::PCMPISTRI;
10030      X86CC = X86::COND_B;
10031      break;
10032    case Intrinsic::x86_sse42_pcmpestric128:
10033      Opcode = X86ISD::PCMPESTRI;
10034      X86CC = X86::COND_B;
10035      break;
10036    case Intrinsic::x86_sse42_pcmpistrio128:
10037      Opcode = X86ISD::PCMPISTRI;
10038      X86CC = X86::COND_O;
10039      break;
10040    case Intrinsic::x86_sse42_pcmpestrio128:
10041      Opcode = X86ISD::PCMPESTRI;
10042      X86CC = X86::COND_O;
10043      break;
10044    case Intrinsic::x86_sse42_pcmpistris128:
10045      Opcode = X86ISD::PCMPISTRI;
10046      X86CC = X86::COND_S;
10047      break;
10048    case Intrinsic::x86_sse42_pcmpestris128:
10049      Opcode = X86ISD::PCMPESTRI;
10050      X86CC = X86::COND_S;
10051      break;
10052    case Intrinsic::x86_sse42_pcmpistriz128:
10053      Opcode = X86ISD::PCMPISTRI;
10054      X86CC = X86::COND_E;
10055      break;
10056    case Intrinsic::x86_sse42_pcmpestriz128:
10057      Opcode = X86ISD::PCMPESTRI;
10058      X86CC = X86::COND_E;
10059      break;
10060    }
10061    SmallVector<SDValue, 5> NewOps;
10062    NewOps.append(Op->op_begin()+1, Op->op_end());
10063    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
10064    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
10065    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
10066                                DAG.getConstant(X86CC, MVT::i8),
10067                                SDValue(PCMP.getNode(), 1));
10068    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
10069  }
10070
10071  case Intrinsic::x86_sse42_pcmpistri128:
10072  case Intrinsic::x86_sse42_pcmpestri128: {
10073    unsigned Opcode;
10074    if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
10075      Opcode = X86ISD::PCMPISTRI;
10076    else
10077      Opcode = X86ISD::PCMPESTRI;
10078
10079    SmallVector<SDValue, 5> NewOps;
10080    NewOps.append(Op->op_begin()+1, Op->op_end());
10081    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
10082    return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
10083  }
10084  case Intrinsic::x86_fma_vfmadd_ps:
10085  case Intrinsic::x86_fma_vfmadd_pd:
10086  case Intrinsic::x86_fma_vfmsub_ps:
10087  case Intrinsic::x86_fma_vfmsub_pd:
10088  case Intrinsic::x86_fma_vfnmadd_ps:
10089  case Intrinsic::x86_fma_vfnmadd_pd:
10090  case Intrinsic::x86_fma_vfnmsub_ps:
10091  case Intrinsic::x86_fma_vfnmsub_pd:
10092  case Intrinsic::x86_fma_vfmaddsub_ps:
10093  case Intrinsic::x86_fma_vfmaddsub_pd:
10094  case Intrinsic::x86_fma_vfmsubadd_ps:
10095  case Intrinsic::x86_fma_vfmsubadd_pd:
10096  case Intrinsic::x86_fma_vfmadd_ps_256:
10097  case Intrinsic::x86_fma_vfmadd_pd_256:
10098  case Intrinsic::x86_fma_vfmsub_ps_256:
10099  case Intrinsic::x86_fma_vfmsub_pd_256:
10100  case Intrinsic::x86_fma_vfnmadd_ps_256:
10101  case Intrinsic::x86_fma_vfnmadd_pd_256:
10102  case Intrinsic::x86_fma_vfnmsub_ps_256:
10103  case Intrinsic::x86_fma_vfnmsub_pd_256:
10104  case Intrinsic::x86_fma_vfmaddsub_ps_256:
10105  case Intrinsic::x86_fma_vfmaddsub_pd_256:
10106  case Intrinsic::x86_fma_vfmsubadd_ps_256:
10107  case Intrinsic::x86_fma_vfmsubadd_pd_256: {
10108    unsigned Opc;
10109    switch (IntNo) {
10110    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
10111    case Intrinsic::x86_fma_vfmadd_ps:
10112    case Intrinsic::x86_fma_vfmadd_pd:
10113    case Intrinsic::x86_fma_vfmadd_ps_256:
10114    case Intrinsic::x86_fma_vfmadd_pd_256:
10115      Opc = X86ISD::FMADD;
10116      break;
10117    case Intrinsic::x86_fma_vfmsub_ps:
10118    case Intrinsic::x86_fma_vfmsub_pd:
10119    case Intrinsic::x86_fma_vfmsub_ps_256:
10120    case Intrinsic::x86_fma_vfmsub_pd_256:
10121      Opc = X86ISD::FMSUB;
10122      break;
10123    case Intrinsic::x86_fma_vfnmadd_ps:
10124    case Intrinsic::x86_fma_vfnmadd_pd:
10125    case Intrinsic::x86_fma_vfnmadd_ps_256:
10126    case Intrinsic::x86_fma_vfnmadd_pd_256:
10127      Opc = X86ISD::FNMADD;
10128      break;
10129    case Intrinsic::x86_fma_vfnmsub_ps:
10130    case Intrinsic::x86_fma_vfnmsub_pd:
10131    case Intrinsic::x86_fma_vfnmsub_ps_256:
10132    case Intrinsic::x86_fma_vfnmsub_pd_256:
10133      Opc = X86ISD::FNMSUB;
10134      break;
10135    case Intrinsic::x86_fma_vfmaddsub_ps:
10136    case Intrinsic::x86_fma_vfmaddsub_pd:
10137    case Intrinsic::x86_fma_vfmaddsub_ps_256:
10138    case Intrinsic::x86_fma_vfmaddsub_pd_256:
10139      Opc = X86ISD::FMADDSUB;
10140      break;
10141    case Intrinsic::x86_fma_vfmsubadd_ps:
10142    case Intrinsic::x86_fma_vfmsubadd_pd:
10143    case Intrinsic::x86_fma_vfmsubadd_ps_256:
10144    case Intrinsic::x86_fma_vfmsubadd_pd_256:
10145      Opc = X86ISD::FMSUBADD;
10146      break;
10147    }
10148
10149    return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
10150                       Op.getOperand(2), Op.getOperand(3));
10151  }
10152  }
10153}
10154
10155SDValue
10156X86TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const {
10157  DebugLoc dl = Op.getDebugLoc();
10158  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
10159  switch (IntNo) {
10160  default: return SDValue();    // Don't custom lower most intrinsics.
10161
10162  // RDRAND intrinsics.
10163  case Intrinsic::x86_rdrand_16:
10164  case Intrinsic::x86_rdrand_32:
10165  case Intrinsic::x86_rdrand_64: {
10166    // Emit the node with the right value type.
10167    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
10168    SDValue Result = DAG.getNode(X86ISD::RDRAND, dl, VTs, Op.getOperand(0));
10169
10170    // If the value returned by RDRAND was valid (CF=1), return 1. Otherwise
10171    // return the value from Rand, which is always 0, casted to i32.
10172    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
10173                      DAG.getConstant(1, Op->getValueType(1)),
10174                      DAG.getConstant(X86::COND_B, MVT::i32),
10175                      SDValue(Result.getNode(), 1) };
10176    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
10177                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
10178                                  Ops, 4);
10179
10180    // Return { result, isValid, chain }.
10181    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
10182                       SDValue(Result.getNode(), 2));
10183  }
10184  }
10185}
10186
10187SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
10188                                           SelectionDAG &DAG) const {
10189  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
10190  MFI->setReturnAddressIsTaken(true);
10191
10192  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
10193  DebugLoc dl = Op.getDebugLoc();
10194
10195  if (Depth > 0) {
10196    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
10197    SDValue Offset =
10198      DAG.getConstant(TD->getPointerSize(),
10199                      Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
10200    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
10201                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
10202                                   FrameAddr, Offset),
10203                       MachinePointerInfo(), false, false, false, 0);
10204  }
10205
10206  // Just load the return address.
10207  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
10208  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
10209                     RetAddrFI, MachinePointerInfo(), false, false, false, 0);
10210}
10211
10212SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
10213  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
10214  MFI->setFrameAddressIsTaken(true);
10215
10216  EVT VT = Op.getValueType();
10217  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
10218  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
10219  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
10220  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
10221  while (Depth--)
10222    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
10223                            MachinePointerInfo(),
10224                            false, false, false, 0);
10225  return FrameAddr;
10226}
10227
10228SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
10229                                                     SelectionDAG &DAG) const {
10230  return DAG.getIntPtrConstant(2*TD->getPointerSize());
10231}
10232
10233SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
10234  SDValue Chain     = Op.getOperand(0);
10235  SDValue Offset    = Op.getOperand(1);
10236  SDValue Handler   = Op.getOperand(2);
10237  DebugLoc dl       = Op.getDebugLoc();
10238
10239  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
10240                                     Subtarget->is64Bit() ? X86::RBP : X86::EBP,
10241                                     getPointerTy());
10242  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
10243
10244  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame,
10245                                  DAG.getIntPtrConstant(TD->getPointerSize()));
10246  StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
10247  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
10248                       false, false, 0);
10249  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
10250
10251  return DAG.getNode(X86ISD::EH_RETURN, dl,
10252                     MVT::Other,
10253                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
10254}
10255
10256SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
10257                                                  SelectionDAG &DAG) const {
10258  return Op.getOperand(0);
10259}
10260
10261SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
10262                                                SelectionDAG &DAG) const {
10263  SDValue Root = Op.getOperand(0);
10264  SDValue Trmp = Op.getOperand(1); // trampoline
10265  SDValue FPtr = Op.getOperand(2); // nested function
10266  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
10267  DebugLoc dl  = Op.getDebugLoc();
10268
10269  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10270
10271  if (Subtarget->is64Bit()) {
10272    SDValue OutChains[6];
10273
10274    // Large code-model.
10275    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
10276    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
10277
10278    const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10);
10279    const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11);
10280
10281    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
10282
10283    // Load the pointer to the nested function into R11.
10284    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
10285    SDValue Addr = Trmp;
10286    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
10287                                Addr, MachinePointerInfo(TrmpAddr),
10288                                false, false, 0);
10289
10290    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
10291                       DAG.getConstant(2, MVT::i64));
10292    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
10293                                MachinePointerInfo(TrmpAddr, 2),
10294                                false, false, 2);
10295
10296    // Load the 'nest' parameter value into R10.
10297    // R10 is specified in X86CallingConv.td
10298    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
10299    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
10300                       DAG.getConstant(10, MVT::i64));
10301    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
10302                                Addr, MachinePointerInfo(TrmpAddr, 10),
10303                                false, false, 0);
10304
10305    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
10306                       DAG.getConstant(12, MVT::i64));
10307    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
10308                                MachinePointerInfo(TrmpAddr, 12),
10309                                false, false, 2);
10310
10311    // Jump to the nested function.
10312    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
10313    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
10314                       DAG.getConstant(20, MVT::i64));
10315    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
10316                                Addr, MachinePointerInfo(TrmpAddr, 20),
10317                                false, false, 0);
10318
10319    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
10320    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
10321                       DAG.getConstant(22, MVT::i64));
10322    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
10323                                MachinePointerInfo(TrmpAddr, 22),
10324                                false, false, 0);
10325
10326    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
10327  } else {
10328    const Function *Func =
10329      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
10330    CallingConv::ID CC = Func->getCallingConv();
10331    unsigned NestReg;
10332
10333    switch (CC) {
10334    default:
10335      llvm_unreachable("Unsupported calling convention");
10336    case CallingConv::C:
10337    case CallingConv::X86_StdCall: {
10338      // Pass 'nest' parameter in ECX.
10339      // Must be kept in sync with X86CallingConv.td
10340      NestReg = X86::ECX;
10341
10342      // Check that ECX wasn't needed by an 'inreg' parameter.
10343      FunctionType *FTy = Func->getFunctionType();
10344      const AttrListPtr &Attrs = Func->getAttributes();
10345
10346      if (!Attrs.isEmpty() && !Func->isVarArg()) {
10347        unsigned InRegCount = 0;
10348        unsigned Idx = 1;
10349
10350        for (FunctionType::param_iterator I = FTy->param_begin(),
10351             E = FTy->param_end(); I != E; ++I, ++Idx)
10352          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
10353            // FIXME: should only count parameters that are lowered to integers.
10354            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
10355
10356        if (InRegCount > 2) {
10357          report_fatal_error("Nest register in use - reduce number of inreg"
10358                             " parameters!");
10359        }
10360      }
10361      break;
10362    }
10363    case CallingConv::X86_FastCall:
10364    case CallingConv::X86_ThisCall:
10365    case CallingConv::Fast:
10366      // Pass 'nest' parameter in EAX.
10367      // Must be kept in sync with X86CallingConv.td
10368      NestReg = X86::EAX;
10369      break;
10370    }
10371
10372    SDValue OutChains[4];
10373    SDValue Addr, Disp;
10374
10375    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10376                       DAG.getConstant(10, MVT::i32));
10377    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
10378
10379    // This is storing the opcode for MOV32ri.
10380    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
10381    const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg);
10382    OutChains[0] = DAG.getStore(Root, dl,
10383                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
10384                                Trmp, MachinePointerInfo(TrmpAddr),
10385                                false, false, 0);
10386
10387    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10388                       DAG.getConstant(1, MVT::i32));
10389    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
10390                                MachinePointerInfo(TrmpAddr, 1),
10391                                false, false, 1);
10392
10393    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
10394    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10395                       DAG.getConstant(5, MVT::i32));
10396    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
10397                                MachinePointerInfo(TrmpAddr, 5),
10398                                false, false, 1);
10399
10400    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
10401                       DAG.getConstant(6, MVT::i32));
10402    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
10403                                MachinePointerInfo(TrmpAddr, 6),
10404                                false, false, 1);
10405
10406    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
10407  }
10408}
10409
10410SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
10411                                            SelectionDAG &DAG) const {
10412  /*
10413   The rounding mode is in bits 11:10 of FPSR, and has the following
10414   settings:
10415     00 Round to nearest
10416     01 Round to -inf
10417     10 Round to +inf
10418     11 Round to 0
10419
10420  FLT_ROUNDS, on the other hand, expects the following:
10421    -1 Undefined
10422     0 Round to 0
10423     1 Round to nearest
10424     2 Round to +inf
10425     3 Round to -inf
10426
10427  To perform the conversion, we do:
10428    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
10429  */
10430
10431  MachineFunction &MF = DAG.getMachineFunction();
10432  const TargetMachine &TM = MF.getTarget();
10433  const TargetFrameLowering &TFI = *TM.getFrameLowering();
10434  unsigned StackAlignment = TFI.getStackAlignment();
10435  EVT VT = Op.getValueType();
10436  DebugLoc DL = Op.getDebugLoc();
10437
10438  // Save FP Control Word to stack slot
10439  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
10440  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
10441
10442
10443  MachineMemOperand *MMO =
10444   MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
10445                           MachineMemOperand::MOStore, 2, 2);
10446
10447  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
10448  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
10449                                          DAG.getVTList(MVT::Other),
10450                                          Ops, 2, MVT::i16, MMO);
10451
10452  // Load FP Control Word from stack slot
10453  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
10454                            MachinePointerInfo(), false, false, false, 0);
10455
10456  // Transform as necessary
10457  SDValue CWD1 =
10458    DAG.getNode(ISD::SRL, DL, MVT::i16,
10459                DAG.getNode(ISD::AND, DL, MVT::i16,
10460                            CWD, DAG.getConstant(0x800, MVT::i16)),
10461                DAG.getConstant(11, MVT::i8));
10462  SDValue CWD2 =
10463    DAG.getNode(ISD::SRL, DL, MVT::i16,
10464                DAG.getNode(ISD::AND, DL, MVT::i16,
10465                            CWD, DAG.getConstant(0x400, MVT::i16)),
10466                DAG.getConstant(9, MVT::i8));
10467
10468  SDValue RetVal =
10469    DAG.getNode(ISD::AND, DL, MVT::i16,
10470                DAG.getNode(ISD::ADD, DL, MVT::i16,
10471                            DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
10472                            DAG.getConstant(1, MVT::i16)),
10473                DAG.getConstant(3, MVT::i16));
10474
10475
10476  return DAG.getNode((VT.getSizeInBits() < 16 ?
10477                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
10478}
10479
10480SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
10481  EVT VT = Op.getValueType();
10482  EVT OpVT = VT;
10483  unsigned NumBits = VT.getSizeInBits();
10484  DebugLoc dl = Op.getDebugLoc();
10485
10486  Op = Op.getOperand(0);
10487  if (VT == MVT::i8) {
10488    // Zero extend to i32 since there is not an i8 bsr.
10489    OpVT = MVT::i32;
10490    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
10491  }
10492
10493  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
10494  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
10495  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
10496
10497  // If src is zero (i.e. bsr sets ZF), returns NumBits.
10498  SDValue Ops[] = {
10499    Op,
10500    DAG.getConstant(NumBits+NumBits-1, OpVT),
10501    DAG.getConstant(X86::COND_E, MVT::i8),
10502    Op.getValue(1)
10503  };
10504  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
10505
10506  // Finally xor with NumBits-1.
10507  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
10508
10509  if (VT == MVT::i8)
10510    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
10511  return Op;
10512}
10513
10514SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op,
10515                                                SelectionDAG &DAG) const {
10516  EVT VT = Op.getValueType();
10517  EVT OpVT = VT;
10518  unsigned NumBits = VT.getSizeInBits();
10519  DebugLoc dl = Op.getDebugLoc();
10520
10521  Op = Op.getOperand(0);
10522  if (VT == MVT::i8) {
10523    // Zero extend to i32 since there is not an i8 bsr.
10524    OpVT = MVT::i32;
10525    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
10526  }
10527
10528  // Issue a bsr (scan bits in reverse).
10529  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
10530  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
10531
10532  // And xor with NumBits-1.
10533  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
10534
10535  if (VT == MVT::i8)
10536    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
10537  return Op;
10538}
10539
10540SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
10541  EVT VT = Op.getValueType();
10542  unsigned NumBits = VT.getSizeInBits();
10543  DebugLoc dl = Op.getDebugLoc();
10544  Op = Op.getOperand(0);
10545
10546  // Issue a bsf (scan bits forward) which also sets EFLAGS.
10547  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
10548  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
10549
10550  // If src is zero (i.e. bsf sets ZF), returns NumBits.
10551  SDValue Ops[] = {
10552    Op,
10553    DAG.getConstant(NumBits, VT),
10554    DAG.getConstant(X86::COND_E, MVT::i8),
10555    Op.getValue(1)
10556  };
10557  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops));
10558}
10559
10560// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
10561// ones, and then concatenate the result back.
10562static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
10563  EVT VT = Op.getValueType();
10564
10565  assert(VT.is256BitVector() && VT.isInteger() &&
10566         "Unsupported value type for operation");
10567
10568  unsigned NumElems = VT.getVectorNumElements();
10569  DebugLoc dl = Op.getDebugLoc();
10570
10571  // Extract the LHS vectors
10572  SDValue LHS = Op.getOperand(0);
10573  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
10574  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
10575
10576  // Extract the RHS vectors
10577  SDValue RHS = Op.getOperand(1);
10578  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
10579  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
10580
10581  MVT EltVT = VT.getVectorElementType().getSimpleVT();
10582  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
10583
10584  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
10585                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
10586                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
10587}
10588
10589SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const {
10590  assert(Op.getValueType().is256BitVector() &&
10591         Op.getValueType().isInteger() &&
10592         "Only handle AVX 256-bit vector integer operation");
10593  return Lower256IntArith(Op, DAG);
10594}
10595
10596SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const {
10597  assert(Op.getValueType().is256BitVector() &&
10598         Op.getValueType().isInteger() &&
10599         "Only handle AVX 256-bit vector integer operation");
10600  return Lower256IntArith(Op, DAG);
10601}
10602
10603SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
10604  EVT VT = Op.getValueType();
10605
10606  // Decompose 256-bit ops into smaller 128-bit ops.
10607  if (VT.is256BitVector() && !Subtarget->hasAVX2())
10608    return Lower256IntArith(Op, DAG);
10609
10610  assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
10611         "Only know how to lower V2I64/V4I64 multiply");
10612
10613  DebugLoc dl = Op.getDebugLoc();
10614
10615  //  Ahi = psrlqi(a, 32);
10616  //  Bhi = psrlqi(b, 32);
10617  //
10618  //  AloBlo = pmuludq(a, b);
10619  //  AloBhi = pmuludq(a, Bhi);
10620  //  AhiBlo = pmuludq(Ahi, b);
10621
10622  //  AloBhi = psllqi(AloBhi, 32);
10623  //  AhiBlo = psllqi(AhiBlo, 32);
10624  //  return AloBlo + AloBhi + AhiBlo;
10625
10626  SDValue A = Op.getOperand(0);
10627  SDValue B = Op.getOperand(1);
10628
10629  SDValue ShAmt = DAG.getConstant(32, MVT::i32);
10630
10631  SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
10632  SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt);
10633
10634  // Bit cast to 32-bit vectors for MULUDQ
10635  EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32;
10636  A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
10637  B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
10638  Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
10639  Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
10640
10641  SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
10642  SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
10643  SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
10644
10645  AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt);
10646  AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt);
10647
10648  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
10649  return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
10650}
10651
10652SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
10653
10654  EVT VT = Op.getValueType();
10655  DebugLoc dl = Op.getDebugLoc();
10656  SDValue R = Op.getOperand(0);
10657  SDValue Amt = Op.getOperand(1);
10658  LLVMContext *Context = DAG.getContext();
10659
10660  if (!Subtarget->hasSSE2())
10661    return SDValue();
10662
10663  // Optimize shl/srl/sra with constant shift amount.
10664  if (isSplatVector(Amt.getNode())) {
10665    SDValue SclrAmt = Amt->getOperand(0);
10666    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
10667      uint64_t ShiftAmt = C->getZExtValue();
10668
10669      if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
10670          (Subtarget->hasAVX2() &&
10671           (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
10672        if (Op.getOpcode() == ISD::SHL)
10673          return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
10674                             DAG.getConstant(ShiftAmt, MVT::i32));
10675        if (Op.getOpcode() == ISD::SRL)
10676          return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
10677                             DAG.getConstant(ShiftAmt, MVT::i32));
10678        if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
10679          return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
10680                             DAG.getConstant(ShiftAmt, MVT::i32));
10681      }
10682
10683      if (VT == MVT::v16i8) {
10684        if (Op.getOpcode() == ISD::SHL) {
10685          // Make a large shift.
10686          SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R,
10687                                    DAG.getConstant(ShiftAmt, MVT::i32));
10688          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
10689          // Zero out the rightmost bits.
10690          SmallVector<SDValue, 16> V(16,
10691                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
10692                                                     MVT::i8));
10693          return DAG.getNode(ISD::AND, dl, VT, SHL,
10694                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
10695        }
10696        if (Op.getOpcode() == ISD::SRL) {
10697          // Make a large shift.
10698          SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R,
10699                                    DAG.getConstant(ShiftAmt, MVT::i32));
10700          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
10701          // Zero out the leftmost bits.
10702          SmallVector<SDValue, 16> V(16,
10703                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
10704                                                     MVT::i8));
10705          return DAG.getNode(ISD::AND, dl, VT, SRL,
10706                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
10707        }
10708        if (Op.getOpcode() == ISD::SRA) {
10709          if (ShiftAmt == 7) {
10710            // R s>> 7  ===  R s< 0
10711            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
10712            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
10713          }
10714
10715          // R s>> a === ((R u>> a) ^ m) - m
10716          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
10717          SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
10718                                                         MVT::i8));
10719          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
10720          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
10721          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
10722          return Res;
10723        }
10724        llvm_unreachable("Unknown shift opcode.");
10725      }
10726
10727      if (Subtarget->hasAVX2() && VT == MVT::v32i8) {
10728        if (Op.getOpcode() == ISD::SHL) {
10729          // Make a large shift.
10730          SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R,
10731                                    DAG.getConstant(ShiftAmt, MVT::i32));
10732          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
10733          // Zero out the rightmost bits.
10734          SmallVector<SDValue, 32> V(32,
10735                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
10736                                                     MVT::i8));
10737          return DAG.getNode(ISD::AND, dl, VT, SHL,
10738                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
10739        }
10740        if (Op.getOpcode() == ISD::SRL) {
10741          // Make a large shift.
10742          SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R,
10743                                    DAG.getConstant(ShiftAmt, MVT::i32));
10744          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
10745          // Zero out the leftmost bits.
10746          SmallVector<SDValue, 32> V(32,
10747                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
10748                                                     MVT::i8));
10749          return DAG.getNode(ISD::AND, dl, VT, SRL,
10750                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
10751        }
10752        if (Op.getOpcode() == ISD::SRA) {
10753          if (ShiftAmt == 7) {
10754            // R s>> 7  ===  R s< 0
10755            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
10756            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
10757          }
10758
10759          // R s>> a === ((R u>> a) ^ m) - m
10760          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
10761          SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
10762                                                         MVT::i8));
10763          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
10764          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
10765          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
10766          return Res;
10767        }
10768        llvm_unreachable("Unknown shift opcode.");
10769      }
10770    }
10771  }
10772
10773  // Lower SHL with variable shift amount.
10774  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
10775    Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1),
10776                     DAG.getConstant(23, MVT::i32));
10777
10778    const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U};
10779    Constant *C = ConstantDataVector::get(*Context, CV);
10780    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
10781    SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
10782                                 MachinePointerInfo::getConstantPool(),
10783                                 false, false, false, 16);
10784
10785    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend);
10786    Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
10787    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
10788    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
10789  }
10790  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
10791    assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
10792
10793    // a = a << 5;
10794    Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1),
10795                     DAG.getConstant(5, MVT::i32));
10796    Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
10797
10798    // Turn 'a' into a mask suitable for VSELECT
10799    SDValue VSelM = DAG.getConstant(0x80, VT);
10800    SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
10801    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
10802
10803    SDValue CM1 = DAG.getConstant(0x0f, VT);
10804    SDValue CM2 = DAG.getConstant(0x3f, VT);
10805
10806    // r = VSELECT(r, psllw(r & (char16)15, 4), a);
10807    SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
10808    M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
10809                            DAG.getConstant(4, MVT::i32), DAG);
10810    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
10811    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
10812
10813    // a += a
10814    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
10815    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
10816    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
10817
10818    // r = VSELECT(r, psllw(r & (char16)63, 2), a);
10819    M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
10820    M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
10821                            DAG.getConstant(2, MVT::i32), DAG);
10822    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
10823    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
10824
10825    // a += a
10826    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
10827    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
10828    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
10829
10830    // return VSELECT(r, r+r, a);
10831    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
10832                    DAG.getNode(ISD::ADD, dl, VT, R, R), R);
10833    return R;
10834  }
10835
10836  // Decompose 256-bit shifts into smaller 128-bit shifts.
10837  if (VT.is256BitVector()) {
10838    unsigned NumElems = VT.getVectorNumElements();
10839    MVT EltVT = VT.getVectorElementType().getSimpleVT();
10840    EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
10841
10842    // Extract the two vectors
10843    SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
10844    SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
10845
10846    // Recreate the shift amount vectors
10847    SDValue Amt1, Amt2;
10848    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
10849      // Constant shift amount
10850      SmallVector<SDValue, 4> Amt1Csts;
10851      SmallVector<SDValue, 4> Amt2Csts;
10852      for (unsigned i = 0; i != NumElems/2; ++i)
10853        Amt1Csts.push_back(Amt->getOperand(i));
10854      for (unsigned i = NumElems/2; i != NumElems; ++i)
10855        Amt2Csts.push_back(Amt->getOperand(i));
10856
10857      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
10858                                 &Amt1Csts[0], NumElems/2);
10859      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
10860                                 &Amt2Csts[0], NumElems/2);
10861    } else {
10862      // Variable shift amount
10863      Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
10864      Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
10865    }
10866
10867    // Issue new vector shifts for the smaller types
10868    V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
10869    V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
10870
10871    // Concatenate the result back
10872    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
10873  }
10874
10875  return SDValue();
10876}
10877
10878SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
10879  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
10880  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
10881  // looks for this combo and may remove the "setcc" instruction if the "setcc"
10882  // has only one use.
10883  SDNode *N = Op.getNode();
10884  SDValue LHS = N->getOperand(0);
10885  SDValue RHS = N->getOperand(1);
10886  unsigned BaseOp = 0;
10887  unsigned Cond = 0;
10888  DebugLoc DL = Op.getDebugLoc();
10889  switch (Op.getOpcode()) {
10890  default: llvm_unreachable("Unknown ovf instruction!");
10891  case ISD::SADDO:
10892    // A subtract of one will be selected as a INC. Note that INC doesn't
10893    // set CF, so we can't do this for UADDO.
10894    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
10895      if (C->isOne()) {
10896        BaseOp = X86ISD::INC;
10897        Cond = X86::COND_O;
10898        break;
10899      }
10900    BaseOp = X86ISD::ADD;
10901    Cond = X86::COND_O;
10902    break;
10903  case ISD::UADDO:
10904    BaseOp = X86ISD::ADD;
10905    Cond = X86::COND_B;
10906    break;
10907  case ISD::SSUBO:
10908    // A subtract of one will be selected as a DEC. Note that DEC doesn't
10909    // set CF, so we can't do this for USUBO.
10910    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
10911      if (C->isOne()) {
10912        BaseOp = X86ISD::DEC;
10913        Cond = X86::COND_O;
10914        break;
10915      }
10916    BaseOp = X86ISD::SUB;
10917    Cond = X86::COND_O;
10918    break;
10919  case ISD::USUBO:
10920    BaseOp = X86ISD::SUB;
10921    Cond = X86::COND_B;
10922    break;
10923  case ISD::SMULO:
10924    BaseOp = X86ISD::SMUL;
10925    Cond = X86::COND_O;
10926    break;
10927  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
10928    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
10929                                 MVT::i32);
10930    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
10931
10932    SDValue SetCC =
10933      DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
10934                  DAG.getConstant(X86::COND_O, MVT::i32),
10935                  SDValue(Sum.getNode(), 2));
10936
10937    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
10938  }
10939  }
10940
10941  // Also sets EFLAGS.
10942  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
10943  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
10944
10945  SDValue SetCC =
10946    DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
10947                DAG.getConstant(Cond, MVT::i32),
10948                SDValue(Sum.getNode(), 1));
10949
10950  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
10951}
10952
10953SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
10954                                                  SelectionDAG &DAG) const {
10955  DebugLoc dl = Op.getDebugLoc();
10956  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
10957  EVT VT = Op.getValueType();
10958
10959  if (!Subtarget->hasSSE2() || !VT.isVector())
10960    return SDValue();
10961
10962  unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
10963                      ExtraVT.getScalarType().getSizeInBits();
10964  SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
10965
10966  switch (VT.getSimpleVT().SimpleTy) {
10967    default: return SDValue();
10968    case MVT::v8i32:
10969    case MVT::v16i16:
10970      if (!Subtarget->hasAVX())
10971        return SDValue();
10972      if (!Subtarget->hasAVX2()) {
10973        // needs to be split
10974        unsigned NumElems = VT.getVectorNumElements();
10975
10976        // Extract the LHS vectors
10977        SDValue LHS = Op.getOperand(0);
10978        SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
10979        SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
10980
10981        MVT EltVT = VT.getVectorElementType().getSimpleVT();
10982        EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
10983
10984        EVT ExtraEltVT = ExtraVT.getVectorElementType();
10985        unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
10986        ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
10987                                   ExtraNumElems/2);
10988        SDValue Extra = DAG.getValueType(ExtraVT);
10989
10990        LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
10991        LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
10992
10993        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);;
10994      }
10995      // fall through
10996    case MVT::v4i32:
10997    case MVT::v8i16: {
10998      SDValue Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT,
10999                                         Op.getOperand(0), ShAmt, DAG);
11000      return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG);
11001    }
11002  }
11003}
11004
11005
11006SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
11007  DebugLoc dl = Op.getDebugLoc();
11008
11009  // Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
11010  // There isn't any reason to disable it if the target processor supports it.
11011  if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) {
11012    SDValue Chain = Op.getOperand(0);
11013    SDValue Zero = DAG.getConstant(0, MVT::i32);
11014    SDValue Ops[] = {
11015      DAG.getRegister(X86::ESP, MVT::i32), // Base
11016      DAG.getTargetConstant(1, MVT::i8),   // Scale
11017      DAG.getRegister(0, MVT::i32),        // Index
11018      DAG.getTargetConstant(0, MVT::i32),  // Disp
11019      DAG.getRegister(0, MVT::i32),        // Segment.
11020      Zero,
11021      Chain
11022    };
11023    SDNode *Res =
11024      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
11025                          array_lengthof(Ops));
11026    return SDValue(Res, 0);
11027  }
11028
11029  unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
11030  if (!isDev)
11031    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
11032
11033  unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
11034  unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
11035  unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
11036  unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
11037
11038  // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
11039  if (!Op1 && !Op2 && !Op3 && Op4)
11040    return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0));
11041
11042  // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
11043  if (Op1 && !Op2 && !Op3 && !Op4)
11044    return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0));
11045
11046  // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)),
11047  //           (MFENCE)>;
11048  return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
11049}
11050
11051SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op,
11052                                             SelectionDAG &DAG) const {
11053  DebugLoc dl = Op.getDebugLoc();
11054  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
11055    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
11056  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
11057    cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
11058
11059  // The only fence that needs an instruction is a sequentially-consistent
11060  // cross-thread fence.
11061  if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
11062    // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
11063    // no-sse2). There isn't any reason to disable it if the target processor
11064    // supports it.
11065    if (Subtarget->hasSSE2() || Subtarget->is64Bit())
11066      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
11067
11068    SDValue Chain = Op.getOperand(0);
11069    SDValue Zero = DAG.getConstant(0, MVT::i32);
11070    SDValue Ops[] = {
11071      DAG.getRegister(X86::ESP, MVT::i32), // Base
11072      DAG.getTargetConstant(1, MVT::i8),   // Scale
11073      DAG.getRegister(0, MVT::i32),        // Index
11074      DAG.getTargetConstant(0, MVT::i32),  // Disp
11075      DAG.getRegister(0, MVT::i32),        // Segment.
11076      Zero,
11077      Chain
11078    };
11079    SDNode *Res =
11080      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
11081                         array_lengthof(Ops));
11082    return SDValue(Res, 0);
11083  }
11084
11085  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
11086  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
11087}
11088
11089
11090SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
11091  EVT T = Op.getValueType();
11092  DebugLoc DL = Op.getDebugLoc();
11093  unsigned Reg = 0;
11094  unsigned size = 0;
11095  switch(T.getSimpleVT().SimpleTy) {
11096  default: llvm_unreachable("Invalid value type!");
11097  case MVT::i8:  Reg = X86::AL;  size = 1; break;
11098  case MVT::i16: Reg = X86::AX;  size = 2; break;
11099  case MVT::i32: Reg = X86::EAX; size = 4; break;
11100  case MVT::i64:
11101    assert(Subtarget->is64Bit() && "Node not type legal!");
11102    Reg = X86::RAX; size = 8;
11103    break;
11104  }
11105  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
11106                                    Op.getOperand(2), SDValue());
11107  SDValue Ops[] = { cpIn.getValue(0),
11108                    Op.getOperand(1),
11109                    Op.getOperand(3),
11110                    DAG.getTargetConstant(size, MVT::i8),
11111                    cpIn.getValue(1) };
11112  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
11113  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
11114  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
11115                                           Ops, 5, T, MMO);
11116  SDValue cpOut =
11117    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
11118  return cpOut;
11119}
11120
11121SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
11122                                                 SelectionDAG &DAG) const {
11123  assert(Subtarget->is64Bit() && "Result not type legalized?");
11124  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
11125  SDValue TheChain = Op.getOperand(0);
11126  DebugLoc dl = Op.getDebugLoc();
11127  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
11128  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
11129  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
11130                                   rax.getValue(2));
11131  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
11132                            DAG.getConstant(32, MVT::i8));
11133  SDValue Ops[] = {
11134    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
11135    rdx.getValue(1)
11136  };
11137  return DAG.getMergeValues(Ops, 2, dl);
11138}
11139
11140SDValue X86TargetLowering::LowerBITCAST(SDValue Op,
11141                                            SelectionDAG &DAG) const {
11142  EVT SrcVT = Op.getOperand(0).getValueType();
11143  EVT DstVT = Op.getValueType();
11144  assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
11145         Subtarget->hasMMX() && "Unexpected custom BITCAST");
11146  assert((DstVT == MVT::i64 ||
11147          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
11148         "Unexpected custom BITCAST");
11149  // i64 <=> MMX conversions are Legal.
11150  if (SrcVT==MVT::i64 && DstVT.isVector())
11151    return Op;
11152  if (DstVT==MVT::i64 && SrcVT.isVector())
11153    return Op;
11154  // MMX <=> MMX conversions are Legal.
11155  if (SrcVT.isVector() && DstVT.isVector())
11156    return Op;
11157  // All other conversions need to be expanded.
11158  return SDValue();
11159}
11160
11161SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
11162  SDNode *Node = Op.getNode();
11163  DebugLoc dl = Node->getDebugLoc();
11164  EVT T = Node->getValueType(0);
11165  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
11166                              DAG.getConstant(0, T), Node->getOperand(2));
11167  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
11168                       cast<AtomicSDNode>(Node)->getMemoryVT(),
11169                       Node->getOperand(0),
11170                       Node->getOperand(1), negOp,
11171                       cast<AtomicSDNode>(Node)->getSrcValue(),
11172                       cast<AtomicSDNode>(Node)->getAlignment(),
11173                       cast<AtomicSDNode>(Node)->getOrdering(),
11174                       cast<AtomicSDNode>(Node)->getSynchScope());
11175}
11176
11177static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
11178  SDNode *Node = Op.getNode();
11179  DebugLoc dl = Node->getDebugLoc();
11180  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
11181
11182  // Convert seq_cst store -> xchg
11183  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
11184  // FIXME: On 32-bit, store -> fist or movq would be more efficient
11185  //        (The only way to get a 16-byte store is cmpxchg16b)
11186  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
11187  if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
11188      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
11189    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
11190                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
11191                                 Node->getOperand(0),
11192                                 Node->getOperand(1), Node->getOperand(2),
11193                                 cast<AtomicSDNode>(Node)->getMemOperand(),
11194                                 cast<AtomicSDNode>(Node)->getOrdering(),
11195                                 cast<AtomicSDNode>(Node)->getSynchScope());
11196    return Swap.getValue(1);
11197  }
11198  // Other atomic stores have a simple pattern.
11199  return Op;
11200}
11201
11202static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
11203  EVT VT = Op.getNode()->getValueType(0);
11204
11205  // Let legalize expand this if it isn't a legal type yet.
11206  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
11207    return SDValue();
11208
11209  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
11210
11211  unsigned Opc;
11212  bool ExtraOp = false;
11213  switch (Op.getOpcode()) {
11214  default: llvm_unreachable("Invalid code");
11215  case ISD::ADDC: Opc = X86ISD::ADD; break;
11216  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
11217  case ISD::SUBC: Opc = X86ISD::SUB; break;
11218  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
11219  }
11220
11221  if (!ExtraOp)
11222    return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
11223                       Op.getOperand(1));
11224  return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
11225                     Op.getOperand(1), Op.getOperand(2));
11226}
11227
11228/// LowerOperation - Provide custom lowering hooks for some operations.
11229///
11230SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
11231  switch (Op.getOpcode()) {
11232  default: llvm_unreachable("Should not custom lower this!");
11233  case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
11234  case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op,DAG);
11235  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op,DAG);
11236  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
11237  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
11238  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
11239  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
11240  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
11241  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
11242  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
11243  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
11244  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op, DAG);
11245  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, DAG);
11246  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
11247  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
11248  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
11249  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
11250  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
11251  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
11252  case ISD::SHL_PARTS:
11253  case ISD::SRA_PARTS:
11254  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
11255  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
11256  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
11257  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
11258  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
11259  case ISD::FABS:               return LowerFABS(Op, DAG);
11260  case ISD::FNEG:               return LowerFNEG(Op, DAG);
11261  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
11262  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
11263  case ISD::SETCC:              return LowerSETCC(Op, DAG);
11264  case ISD::SELECT:             return LowerSELECT(Op, DAG);
11265  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
11266  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
11267  case ISD::VASTART:            return LowerVASTART(Op, DAG);
11268  case ISD::VAARG:              return LowerVAARG(Op, DAG);
11269  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
11270  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
11271  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, DAG);
11272  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
11273  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
11274  case ISD::FRAME_TO_ARGS_OFFSET:
11275                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
11276  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
11277  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
11278  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
11279  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
11280  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
11281  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
11282  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
11283  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
11284  case ISD::MUL:                return LowerMUL(Op, DAG);
11285  case ISD::SRA:
11286  case ISD::SRL:
11287  case ISD::SHL:                return LowerShift(Op, DAG);
11288  case ISD::SADDO:
11289  case ISD::UADDO:
11290  case ISD::SSUBO:
11291  case ISD::USUBO:
11292  case ISD::SMULO:
11293  case ISD::UMULO:              return LowerXALUO(Op, DAG);
11294  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
11295  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
11296  case ISD::ADDC:
11297  case ISD::ADDE:
11298  case ISD::SUBC:
11299  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
11300  case ISD::ADD:                return LowerADD(Op, DAG);
11301  case ISD::SUB:                return LowerSUB(Op, DAG);
11302  }
11303}
11304
11305static void ReplaceATOMIC_LOAD(SDNode *Node,
11306                                  SmallVectorImpl<SDValue> &Results,
11307                                  SelectionDAG &DAG) {
11308  DebugLoc dl = Node->getDebugLoc();
11309  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
11310
11311  // Convert wide load -> cmpxchg8b/cmpxchg16b
11312  // FIXME: On 32-bit, load -> fild or movq would be more efficient
11313  //        (The only way to get a 16-byte load is cmpxchg16b)
11314  // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
11315  SDValue Zero = DAG.getConstant(0, VT);
11316  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
11317                               Node->getOperand(0),
11318                               Node->getOperand(1), Zero, Zero,
11319                               cast<AtomicSDNode>(Node)->getMemOperand(),
11320                               cast<AtomicSDNode>(Node)->getOrdering(),
11321                               cast<AtomicSDNode>(Node)->getSynchScope());
11322  Results.push_back(Swap.getValue(0));
11323  Results.push_back(Swap.getValue(1));
11324}
11325
11326static void
11327ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
11328                        SelectionDAG &DAG, unsigned NewOp) {
11329  DebugLoc dl = Node->getDebugLoc();
11330  assert (Node->getValueType(0) == MVT::i64 &&
11331          "Only know how to expand i64 atomics");
11332
11333  SDValue Chain = Node->getOperand(0);
11334  SDValue In1 = Node->getOperand(1);
11335  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
11336                             Node->getOperand(2), DAG.getIntPtrConstant(0));
11337  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
11338                             Node->getOperand(2), DAG.getIntPtrConstant(1));
11339  SDValue Ops[] = { Chain, In1, In2L, In2H };
11340  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
11341  SDValue Result =
11342    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
11343                            cast<MemSDNode>(Node)->getMemOperand());
11344  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
11345  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
11346  Results.push_back(Result.getValue(2));
11347}
11348
11349/// ReplaceNodeResults - Replace a node with an illegal result type
11350/// with a new node built out of custom code.
11351void X86TargetLowering::ReplaceNodeResults(SDNode *N,
11352                                           SmallVectorImpl<SDValue>&Results,
11353                                           SelectionDAG &DAG) const {
11354  DebugLoc dl = N->getDebugLoc();
11355  switch (N->getOpcode()) {
11356  default:
11357    llvm_unreachable("Do not know how to custom type legalize this operation!");
11358  case ISD::SIGN_EXTEND_INREG:
11359  case ISD::ADDC:
11360  case ISD::ADDE:
11361  case ISD::SUBC:
11362  case ISD::SUBE:
11363    // We don't want to expand or promote these.
11364    return;
11365  case ISD::FP_TO_SINT:
11366  case ISD::FP_TO_UINT: {
11367    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
11368
11369    if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
11370      return;
11371
11372    std::pair<SDValue,SDValue> Vals =
11373        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
11374    SDValue FIST = Vals.first, StackSlot = Vals.second;
11375    if (FIST.getNode() != 0) {
11376      EVT VT = N->getValueType(0);
11377      // Return a load from the stack slot.
11378      if (StackSlot.getNode() != 0)
11379        Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
11380                                      MachinePointerInfo(),
11381                                      false, false, false, 0));
11382      else
11383        Results.push_back(FIST);
11384    }
11385    return;
11386  }
11387  case ISD::READCYCLECOUNTER: {
11388    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
11389    SDValue TheChain = N->getOperand(0);
11390    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
11391    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
11392                                     rd.getValue(1));
11393    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
11394                                     eax.getValue(2));
11395    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
11396    SDValue Ops[] = { eax, edx };
11397    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
11398    Results.push_back(edx.getValue(1));
11399    return;
11400  }
11401  case ISD::ATOMIC_CMP_SWAP: {
11402    EVT T = N->getValueType(0);
11403    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
11404    bool Regs64bit = T == MVT::i128;
11405    EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
11406    SDValue cpInL, cpInH;
11407    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
11408                        DAG.getConstant(0, HalfT));
11409    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
11410                        DAG.getConstant(1, HalfT));
11411    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
11412                             Regs64bit ? X86::RAX : X86::EAX,
11413                             cpInL, SDValue());
11414    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
11415                             Regs64bit ? X86::RDX : X86::EDX,
11416                             cpInH, cpInL.getValue(1));
11417    SDValue swapInL, swapInH;
11418    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
11419                          DAG.getConstant(0, HalfT));
11420    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
11421                          DAG.getConstant(1, HalfT));
11422    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
11423                               Regs64bit ? X86::RBX : X86::EBX,
11424                               swapInL, cpInH.getValue(1));
11425    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
11426                               Regs64bit ? X86::RCX : X86::ECX,
11427                               swapInH, swapInL.getValue(1));
11428    SDValue Ops[] = { swapInH.getValue(0),
11429                      N->getOperand(1),
11430                      swapInH.getValue(1) };
11431    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
11432    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
11433    unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
11434                                  X86ISD::LCMPXCHG8_DAG;
11435    SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
11436                                             Ops, 3, T, MMO);
11437    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
11438                                        Regs64bit ? X86::RAX : X86::EAX,
11439                                        HalfT, Result.getValue(1));
11440    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
11441                                        Regs64bit ? X86::RDX : X86::EDX,
11442                                        HalfT, cpOutL.getValue(2));
11443    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
11444    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
11445    Results.push_back(cpOutH.getValue(1));
11446    return;
11447  }
11448  case ISD::ATOMIC_LOAD_ADD:
11449  case ISD::ATOMIC_LOAD_AND:
11450  case ISD::ATOMIC_LOAD_NAND:
11451  case ISD::ATOMIC_LOAD_OR:
11452  case ISD::ATOMIC_LOAD_SUB:
11453  case ISD::ATOMIC_LOAD_XOR:
11454  case ISD::ATOMIC_SWAP: {
11455    unsigned Opc;
11456    switch (N->getOpcode()) {
11457    default: llvm_unreachable("Unexpected opcode");
11458    case ISD::ATOMIC_LOAD_ADD:
11459      Opc = X86ISD::ATOMADD64_DAG;
11460      break;
11461    case ISD::ATOMIC_LOAD_AND:
11462      Opc = X86ISD::ATOMAND64_DAG;
11463      break;
11464    case ISD::ATOMIC_LOAD_NAND:
11465      Opc = X86ISD::ATOMNAND64_DAG;
11466      break;
11467    case ISD::ATOMIC_LOAD_OR:
11468      Opc = X86ISD::ATOMOR64_DAG;
11469      break;
11470    case ISD::ATOMIC_LOAD_SUB:
11471      Opc = X86ISD::ATOMSUB64_DAG;
11472      break;
11473    case ISD::ATOMIC_LOAD_XOR:
11474      Opc = X86ISD::ATOMXOR64_DAG;
11475      break;
11476    case ISD::ATOMIC_SWAP:
11477      Opc = X86ISD::ATOMSWAP64_DAG;
11478      break;
11479    }
11480    ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
11481    return;
11482  }
11483  case ISD::ATOMIC_LOAD:
11484    ReplaceATOMIC_LOAD(N, Results, DAG);
11485  }
11486}
11487
11488const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
11489  switch (Opcode) {
11490  default: return NULL;
11491  case X86ISD::BSF:                return "X86ISD::BSF";
11492  case X86ISD::BSR:                return "X86ISD::BSR";
11493  case X86ISD::SHLD:               return "X86ISD::SHLD";
11494  case X86ISD::SHRD:               return "X86ISD::SHRD";
11495  case X86ISD::FAND:               return "X86ISD::FAND";
11496  case X86ISD::FOR:                return "X86ISD::FOR";
11497  case X86ISD::FXOR:               return "X86ISD::FXOR";
11498  case X86ISD::FSRL:               return "X86ISD::FSRL";
11499  case X86ISD::FILD:               return "X86ISD::FILD";
11500  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
11501  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
11502  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
11503  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
11504  case X86ISD::FLD:                return "X86ISD::FLD";
11505  case X86ISD::FST:                return "X86ISD::FST";
11506  case X86ISD::CALL:               return "X86ISD::CALL";
11507  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
11508  case X86ISD::BT:                 return "X86ISD::BT";
11509  case X86ISD::CMP:                return "X86ISD::CMP";
11510  case X86ISD::COMI:               return "X86ISD::COMI";
11511  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
11512  case X86ISD::SETCC:              return "X86ISD::SETCC";
11513  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
11514  case X86ISD::FSETCCsd:           return "X86ISD::FSETCCsd";
11515  case X86ISD::FSETCCss:           return "X86ISD::FSETCCss";
11516  case X86ISD::CMOV:               return "X86ISD::CMOV";
11517  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
11518  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
11519  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
11520  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
11521  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
11522  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
11523  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
11524  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
11525  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
11526  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
11527  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
11528  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
11529  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
11530  case X86ISD::ANDNP:              return "X86ISD::ANDNP";
11531  case X86ISD::PSIGN:              return "X86ISD::PSIGN";
11532  case X86ISD::BLENDV:             return "X86ISD::BLENDV";
11533  case X86ISD::BLENDPW:            return "X86ISD::BLENDPW";
11534  case X86ISD::BLENDPS:            return "X86ISD::BLENDPS";
11535  case X86ISD::BLENDPD:            return "X86ISD::BLENDPD";
11536  case X86ISD::HADD:               return "X86ISD::HADD";
11537  case X86ISD::HSUB:               return "X86ISD::HSUB";
11538  case X86ISD::FHADD:              return "X86ISD::FHADD";
11539  case X86ISD::FHSUB:              return "X86ISD::FHSUB";
11540  case X86ISD::FMAX:               return "X86ISD::FMAX";
11541  case X86ISD::FMIN:               return "X86ISD::FMIN";
11542  case X86ISD::FMAXC:              return "X86ISD::FMAXC";
11543  case X86ISD::FMINC:              return "X86ISD::FMINC";
11544  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
11545  case X86ISD::FRCP:               return "X86ISD::FRCP";
11546  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
11547  case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
11548  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
11549  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
11550  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
11551  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
11552  case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
11553  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
11554  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
11555  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
11556  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
11557  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
11558  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
11559  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
11560  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
11561  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
11562  case X86ISD::VSEXT_MOVL:         return "X86ISD::VSEXT_MOVL";
11563  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
11564  case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
11565  case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
11566  case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
11567  case X86ISD::VSHL:               return "X86ISD::VSHL";
11568  case X86ISD::VSRL:               return "X86ISD::VSRL";
11569  case X86ISD::VSRA:               return "X86ISD::VSRA";
11570  case X86ISD::VSHLI:              return "X86ISD::VSHLI";
11571  case X86ISD::VSRLI:              return "X86ISD::VSRLI";
11572  case X86ISD::VSRAI:              return "X86ISD::VSRAI";
11573  case X86ISD::CMPP:               return "X86ISD::CMPP";
11574  case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
11575  case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
11576  case X86ISD::ADD:                return "X86ISD::ADD";
11577  case X86ISD::SUB:                return "X86ISD::SUB";
11578  case X86ISD::ADC:                return "X86ISD::ADC";
11579  case X86ISD::SBB:                return "X86ISD::SBB";
11580  case X86ISD::SMUL:               return "X86ISD::SMUL";
11581  case X86ISD::UMUL:               return "X86ISD::UMUL";
11582  case X86ISD::INC:                return "X86ISD::INC";
11583  case X86ISD::DEC:                return "X86ISD::DEC";
11584  case X86ISD::OR:                 return "X86ISD::OR";
11585  case X86ISD::XOR:                return "X86ISD::XOR";
11586  case X86ISD::AND:                return "X86ISD::AND";
11587  case X86ISD::ANDN:               return "X86ISD::ANDN";
11588  case X86ISD::BLSI:               return "X86ISD::BLSI";
11589  case X86ISD::BLSMSK:             return "X86ISD::BLSMSK";
11590  case X86ISD::BLSR:               return "X86ISD::BLSR";
11591  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
11592  case X86ISD::PTEST:              return "X86ISD::PTEST";
11593  case X86ISD::TESTP:              return "X86ISD::TESTP";
11594  case X86ISD::PALIGN:             return "X86ISD::PALIGN";
11595  case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
11596  case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
11597  case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
11598  case X86ISD::SHUFP:              return "X86ISD::SHUFP";
11599  case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
11600  case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
11601  case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
11602  case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
11603  case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
11604  case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
11605  case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
11606  case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
11607  case X86ISD::MOVSD:              return "X86ISD::MOVSD";
11608  case X86ISD::MOVSS:              return "X86ISD::MOVSS";
11609  case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
11610  case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
11611  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
11612  case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
11613  case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
11614  case X86ISD::VPERMV:             return "X86ISD::VPERMV";
11615  case X86ISD::VPERMI:             return "X86ISD::VPERMI";
11616  case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
11617  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
11618  case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
11619  case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
11620  case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
11621  case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
11622  case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
11623  case X86ISD::SAHF:               return "X86ISD::SAHF";
11624  case X86ISD::RDRAND:             return "X86ISD::RDRAND";
11625  case X86ISD::FMADD:              return "X86ISD::FMADD";
11626  case X86ISD::FMSUB:              return "X86ISD::FMSUB";
11627  case X86ISD::FNMADD:             return "X86ISD::FNMADD";
11628  case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
11629  case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
11630  case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
11631  }
11632}
11633
11634// isLegalAddressingMode - Return true if the addressing mode represented
11635// by AM is legal for this target, for a load/store of the specified type.
11636bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
11637                                              Type *Ty) const {
11638  // X86 supports extremely general addressing modes.
11639  CodeModel::Model M = getTargetMachine().getCodeModel();
11640  Reloc::Model R = getTargetMachine().getRelocationModel();
11641
11642  // X86 allows a sign-extended 32-bit immediate field as a displacement.
11643  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
11644    return false;
11645
11646  if (AM.BaseGV) {
11647    unsigned GVFlags =
11648      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
11649
11650    // If a reference to this global requires an extra load, we can't fold it.
11651    if (isGlobalStubReference(GVFlags))
11652      return false;
11653
11654    // If BaseGV requires a register for the PIC base, we cannot also have a
11655    // BaseReg specified.
11656    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
11657      return false;
11658
11659    // If lower 4G is not available, then we must use rip-relative addressing.
11660    if ((M != CodeModel::Small || R != Reloc::Static) &&
11661        Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
11662      return false;
11663  }
11664
11665  switch (AM.Scale) {
11666  case 0:
11667  case 1:
11668  case 2:
11669  case 4:
11670  case 8:
11671    // These scales always work.
11672    break;
11673  case 3:
11674  case 5:
11675  case 9:
11676    // These scales are formed with basereg+scalereg.  Only accept if there is
11677    // no basereg yet.
11678    if (AM.HasBaseReg)
11679      return false;
11680    break;
11681  default:  // Other stuff never works.
11682    return false;
11683  }
11684
11685  return true;
11686}
11687
11688
11689bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
11690  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
11691    return false;
11692  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
11693  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
11694  if (NumBits1 <= NumBits2)
11695    return false;
11696  return true;
11697}
11698
11699bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
11700  return Imm == (int32_t)Imm;
11701}
11702
11703bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
11704  // Can also use sub to handle negated immediates.
11705  return Imm == (int32_t)Imm;
11706}
11707
11708bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
11709  if (!VT1.isInteger() || !VT2.isInteger())
11710    return false;
11711  unsigned NumBits1 = VT1.getSizeInBits();
11712  unsigned NumBits2 = VT2.getSizeInBits();
11713  if (NumBits1 <= NumBits2)
11714    return false;
11715  return true;
11716}
11717
11718bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
11719  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
11720  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
11721}
11722
11723bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
11724  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
11725  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
11726}
11727
11728bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
11729  // i16 instructions are longer (0x66 prefix) and potentially slower.
11730  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
11731}
11732
11733/// isShuffleMaskLegal - Targets can use this to indicate that they only
11734/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
11735/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
11736/// are assumed to be legal.
11737bool
11738X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
11739                                      EVT VT) const {
11740  // Very little shuffling can be done for 64-bit vectors right now.
11741  if (VT.getSizeInBits() == 64)
11742    return false;
11743
11744  // FIXME: pshufb, blends, shifts.
11745  return (VT.getVectorNumElements() == 2 ||
11746          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
11747          isMOVLMask(M, VT) ||
11748          isSHUFPMask(M, VT, Subtarget->hasAVX()) ||
11749          isPSHUFDMask(M, VT) ||
11750          isPSHUFHWMask(M, VT, Subtarget->hasAVX2()) ||
11751          isPSHUFLWMask(M, VT, Subtarget->hasAVX2()) ||
11752          isPALIGNRMask(M, VT, Subtarget) ||
11753          isUNPCKLMask(M, VT, Subtarget->hasAVX2()) ||
11754          isUNPCKHMask(M, VT, Subtarget->hasAVX2()) ||
11755          isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) ||
11756          isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasAVX2()));
11757}
11758
11759bool
11760X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
11761                                          EVT VT) const {
11762  unsigned NumElts = VT.getVectorNumElements();
11763  // FIXME: This collection of masks seems suspect.
11764  if (NumElts == 2)
11765    return true;
11766  if (NumElts == 4 && VT.is128BitVector()) {
11767    return (isMOVLMask(Mask, VT)  ||
11768            isCommutedMOVLMask(Mask, VT, true) ||
11769            isSHUFPMask(Mask, VT, Subtarget->hasAVX()) ||
11770            isSHUFPMask(Mask, VT, Subtarget->hasAVX(), /* Commuted */ true));
11771  }
11772  return false;
11773}
11774
11775//===----------------------------------------------------------------------===//
11776//                           X86 Scheduler Hooks
11777//===----------------------------------------------------------------------===//
11778
11779// private utility function
11780MachineBasicBlock *
11781X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
11782                                                       MachineBasicBlock *MBB,
11783                                                       unsigned regOpc,
11784                                                       unsigned immOpc,
11785                                                       unsigned LoadOpc,
11786                                                       unsigned CXchgOpc,
11787                                                       unsigned notOpc,
11788                                                       unsigned EAXreg,
11789                                                 const TargetRegisterClass *RC,
11790                                                       bool Invert) const {
11791  // For the atomic bitwise operator, we generate
11792  //   thisMBB:
11793  //   newMBB:
11794  //     ld  t1 = [bitinstr.addr]
11795  //     op  t2 = t1, [bitinstr.val]
11796  //     not t3 = t2  (if Invert)
11797  //     mov EAX = t1
11798  //     lcs dest = [bitinstr.addr], t3  [EAX is implicit]
11799  //     bz  newMBB
11800  //     fallthrough -->nextMBB
11801  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
11802  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
11803  MachineFunction::iterator MBBIter = MBB;
11804  ++MBBIter;
11805
11806  /// First build the CFG
11807  MachineFunction *F = MBB->getParent();
11808  MachineBasicBlock *thisMBB = MBB;
11809  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
11810  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
11811  F->insert(MBBIter, newMBB);
11812  F->insert(MBBIter, nextMBB);
11813
11814  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
11815  nextMBB->splice(nextMBB->begin(), thisMBB,
11816                  llvm::next(MachineBasicBlock::iterator(bInstr)),
11817                  thisMBB->end());
11818  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
11819
11820  // Update thisMBB to fall through to newMBB
11821  thisMBB->addSuccessor(newMBB);
11822
11823  // newMBB jumps to itself and fall through to nextMBB
11824  newMBB->addSuccessor(nextMBB);
11825  newMBB->addSuccessor(newMBB);
11826
11827  // Insert instructions into newMBB based on incoming instruction
11828  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
11829         "unexpected number of operands");
11830  DebugLoc dl = bInstr->getDebugLoc();
11831  MachineOperand& destOper = bInstr->getOperand(0);
11832  MachineOperand* argOpers[2 + X86::AddrNumOperands];
11833  int numArgs = bInstr->getNumOperands() - 1;
11834  for (int i=0; i < numArgs; ++i)
11835    argOpers[i] = &bInstr->getOperand(i+1);
11836
11837  // x86 address has 4 operands: base, index, scale, and displacement
11838  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
11839  int valArgIndx = lastAddrIndx + 1;
11840
11841  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
11842  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
11843  for (int i=0; i <= lastAddrIndx; ++i)
11844    (*MIB).addOperand(*argOpers[i]);
11845
11846  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
11847  assert((argOpers[valArgIndx]->isReg() ||
11848          argOpers[valArgIndx]->isImm()) &&
11849         "invalid operand");
11850  if (argOpers[valArgIndx]->isReg())
11851    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
11852  else
11853    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
11854  MIB.addReg(t1);
11855  (*MIB).addOperand(*argOpers[valArgIndx]);
11856
11857  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
11858  if (Invert) {
11859    MIB = BuildMI(newMBB, dl, TII->get(notOpc), t3).addReg(t2);
11860  }
11861  else
11862    t3 = t2;
11863
11864  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
11865  MIB.addReg(t1);
11866
11867  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
11868  for (int i=0; i <= lastAddrIndx; ++i)
11869    (*MIB).addOperand(*argOpers[i]);
11870  MIB.addReg(t3);
11871  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
11872  (*MIB).setMemRefs(bInstr->memoperands_begin(),
11873                    bInstr->memoperands_end());
11874
11875  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
11876  MIB.addReg(EAXreg);
11877
11878  // insert branch
11879  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
11880
11881  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
11882  return nextMBB;
11883}
11884
11885// private utility function:  64 bit atomics on 32 bit host.
11886MachineBasicBlock *
11887X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
11888                                                       MachineBasicBlock *MBB,
11889                                                       unsigned regOpcL,
11890                                                       unsigned regOpcH,
11891                                                       unsigned immOpcL,
11892                                                       unsigned immOpcH,
11893                                                       bool Invert) const {
11894  // For the atomic bitwise operator, we generate
11895  //   thisMBB (instructions are in pairs, except cmpxchg8b)
11896  //     ld t1,t2 = [bitinstr.addr]
11897  //   newMBB:
11898  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
11899  //     op  t5, t6 <- out1, out2, [bitinstr.val]
11900  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
11901  //     neg t7, t8 < t5, t6  (if Invert)
11902  //     mov ECX, EBX <- t5, t6
11903  //     mov EAX, EDX <- t1, t2
11904  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
11905  //     mov t3, t4 <- EAX, EDX
11906  //     bz  newMBB
11907  //     result in out1, out2
11908  //     fallthrough -->nextMBB
11909
11910  const TargetRegisterClass *RC = &X86::GR32RegClass;
11911  const unsigned LoadOpc = X86::MOV32rm;
11912  const unsigned NotOpc = X86::NOT32r;
11913  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
11914  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
11915  MachineFunction::iterator MBBIter = MBB;
11916  ++MBBIter;
11917
11918  /// First build the CFG
11919  MachineFunction *F = MBB->getParent();
11920  MachineBasicBlock *thisMBB = MBB;
11921  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
11922  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
11923  F->insert(MBBIter, newMBB);
11924  F->insert(MBBIter, nextMBB);
11925
11926  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
11927  nextMBB->splice(nextMBB->begin(), thisMBB,
11928                  llvm::next(MachineBasicBlock::iterator(bInstr)),
11929                  thisMBB->end());
11930  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
11931
11932  // Update thisMBB to fall through to newMBB
11933  thisMBB->addSuccessor(newMBB);
11934
11935  // newMBB jumps to itself and fall through to nextMBB
11936  newMBB->addSuccessor(nextMBB);
11937  newMBB->addSuccessor(newMBB);
11938
11939  DebugLoc dl = bInstr->getDebugLoc();
11940  // Insert instructions into newMBB based on incoming instruction
11941  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
11942  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
11943         "unexpected number of operands");
11944  MachineOperand& dest1Oper = bInstr->getOperand(0);
11945  MachineOperand& dest2Oper = bInstr->getOperand(1);
11946  MachineOperand* argOpers[2 + X86::AddrNumOperands];
11947  for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
11948    argOpers[i] = &bInstr->getOperand(i+2);
11949
11950    // We use some of the operands multiple times, so conservatively just
11951    // clear any kill flags that might be present.
11952    if (argOpers[i]->isReg() && argOpers[i]->isUse())
11953      argOpers[i]->setIsKill(false);
11954  }
11955
11956  // x86 address has 5 operands: base, index, scale, displacement, and segment.
11957  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
11958
11959  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
11960  MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
11961  for (int i=0; i <= lastAddrIndx; ++i)
11962    (*MIB).addOperand(*argOpers[i]);
11963  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
11964  MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
11965  // add 4 to displacement.
11966  for (int i=0; i <= lastAddrIndx-2; ++i)
11967    (*MIB).addOperand(*argOpers[i]);
11968  MachineOperand newOp3 = *(argOpers[3]);
11969  if (newOp3.isImm())
11970    newOp3.setImm(newOp3.getImm()+4);
11971  else
11972    newOp3.setOffset(newOp3.getOffset()+4);
11973  (*MIB).addOperand(newOp3);
11974  (*MIB).addOperand(*argOpers[lastAddrIndx]);
11975
11976  // t3/4 are defined later, at the bottom of the loop
11977  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
11978  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
11979  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
11980    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
11981  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
11982    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
11983
11984  // The subsequent operations should be using the destination registers of
11985  // the PHI instructions.
11986  t1 = dest1Oper.getReg();
11987  t2 = dest2Oper.getReg();
11988
11989  int valArgIndx = lastAddrIndx + 1;
11990  assert((argOpers[valArgIndx]->isReg() ||
11991          argOpers[valArgIndx]->isImm()) &&
11992         "invalid operand");
11993  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
11994  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
11995  if (argOpers[valArgIndx]->isReg())
11996    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
11997  else
11998    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
11999  if (regOpcL != X86::MOV32rr)
12000    MIB.addReg(t1);
12001  (*MIB).addOperand(*argOpers[valArgIndx]);
12002  assert(argOpers[valArgIndx + 1]->isReg() ==
12003         argOpers[valArgIndx]->isReg());
12004  assert(argOpers[valArgIndx + 1]->isImm() ==
12005         argOpers[valArgIndx]->isImm());
12006  if (argOpers[valArgIndx + 1]->isReg())
12007    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
12008  else
12009    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
12010  if (regOpcH != X86::MOV32rr)
12011    MIB.addReg(t2);
12012  (*MIB).addOperand(*argOpers[valArgIndx + 1]);
12013
12014  unsigned t7, t8;
12015  if (Invert) {
12016    t7 = F->getRegInfo().createVirtualRegister(RC);
12017    t8 = F->getRegInfo().createVirtualRegister(RC);
12018    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t7).addReg(t5);
12019    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t8).addReg(t6);
12020  } else {
12021    t7 = t5;
12022    t8 = t6;
12023  }
12024
12025  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
12026  MIB.addReg(t1);
12027  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
12028  MIB.addReg(t2);
12029
12030  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
12031  MIB.addReg(t7);
12032  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
12033  MIB.addReg(t8);
12034
12035  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
12036  for (int i=0; i <= lastAddrIndx; ++i)
12037    (*MIB).addOperand(*argOpers[i]);
12038
12039  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
12040  (*MIB).setMemRefs(bInstr->memoperands_begin(),
12041                    bInstr->memoperands_end());
12042
12043  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
12044  MIB.addReg(X86::EAX);
12045  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
12046  MIB.addReg(X86::EDX);
12047
12048  // insert branch
12049  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
12050
12051  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
12052  return nextMBB;
12053}
12054
12055// private utility function
12056MachineBasicBlock *
12057X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
12058                                                      MachineBasicBlock *MBB,
12059                                                      unsigned cmovOpc) const {
12060  // For the atomic min/max operator, we generate
12061  //   thisMBB:
12062  //   newMBB:
12063  //     ld t1 = [min/max.addr]
12064  //     mov t2 = [min/max.val]
12065  //     cmp  t1, t2
12066  //     cmov[cond] t2 = t1
12067  //     mov EAX = t1
12068  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
12069  //     bz   newMBB
12070  //     fallthrough -->nextMBB
12071  //
12072  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12073  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
12074  MachineFunction::iterator MBBIter = MBB;
12075  ++MBBIter;
12076
12077  /// First build the CFG
12078  MachineFunction *F = MBB->getParent();
12079  MachineBasicBlock *thisMBB = MBB;
12080  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
12081  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
12082  F->insert(MBBIter, newMBB);
12083  F->insert(MBBIter, nextMBB);
12084
12085  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
12086  nextMBB->splice(nextMBB->begin(), thisMBB,
12087                  llvm::next(MachineBasicBlock::iterator(mInstr)),
12088                  thisMBB->end());
12089  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
12090
12091  // Update thisMBB to fall through to newMBB
12092  thisMBB->addSuccessor(newMBB);
12093
12094  // newMBB jumps to newMBB and fall through to nextMBB
12095  newMBB->addSuccessor(nextMBB);
12096  newMBB->addSuccessor(newMBB);
12097
12098  DebugLoc dl = mInstr->getDebugLoc();
12099  // Insert instructions into newMBB based on incoming instruction
12100  assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
12101         "unexpected number of operands");
12102  MachineOperand& destOper = mInstr->getOperand(0);
12103  MachineOperand* argOpers[2 + X86::AddrNumOperands];
12104  int numArgs = mInstr->getNumOperands() - 1;
12105  for (int i=0; i < numArgs; ++i)
12106    argOpers[i] = &mInstr->getOperand(i+1);
12107
12108  // x86 address has 4 operands: base, index, scale, and displacement
12109  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
12110  int valArgIndx = lastAddrIndx + 1;
12111
12112  unsigned t1 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
12113  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
12114  for (int i=0; i <= lastAddrIndx; ++i)
12115    (*MIB).addOperand(*argOpers[i]);
12116
12117  // We only support register and immediate values
12118  assert((argOpers[valArgIndx]->isReg() ||
12119          argOpers[valArgIndx]->isImm()) &&
12120         "invalid operand");
12121
12122  unsigned t2 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
12123  if (argOpers[valArgIndx]->isReg())
12124    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
12125  else
12126    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
12127  (*MIB).addOperand(*argOpers[valArgIndx]);
12128
12129  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
12130  MIB.addReg(t1);
12131
12132  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
12133  MIB.addReg(t1);
12134  MIB.addReg(t2);
12135
12136  // Generate movc
12137  unsigned t3 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
12138  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
12139  MIB.addReg(t2);
12140  MIB.addReg(t1);
12141
12142  // Cmp and exchange if none has modified the memory location
12143  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
12144  for (int i=0; i <= lastAddrIndx; ++i)
12145    (*MIB).addOperand(*argOpers[i]);
12146  MIB.addReg(t3);
12147  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
12148  (*MIB).setMemRefs(mInstr->memoperands_begin(),
12149                    mInstr->memoperands_end());
12150
12151  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
12152  MIB.addReg(X86::EAX);
12153
12154  // insert branch
12155  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
12156
12157  mInstr->eraseFromParent();   // The pseudo instruction is gone now.
12158  return nextMBB;
12159}
12160
12161// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
12162// or XMM0_V32I8 in AVX all of this code can be replaced with that
12163// in the .td file.
12164MachineBasicBlock *
12165X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
12166                            unsigned numArgs, bool memArg) const {
12167  assert(Subtarget->hasSSE42() &&
12168         "Target must have SSE4.2 or AVX features enabled");
12169
12170  DebugLoc dl = MI->getDebugLoc();
12171  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12172  unsigned Opc;
12173  if (!Subtarget->hasAVX()) {
12174    if (memArg)
12175      Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
12176    else
12177      Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
12178  } else {
12179    if (memArg)
12180      Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm;
12181    else
12182      Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
12183  }
12184
12185  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
12186  for (unsigned i = 0; i < numArgs; ++i) {
12187    MachineOperand &Op = MI->getOperand(i+1);
12188    if (!(Op.isReg() && Op.isImplicit()))
12189      MIB.addOperand(Op);
12190  }
12191  BuildMI(*BB, MI, dl,
12192    TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
12193    .addReg(X86::XMM0);
12194
12195  MI->eraseFromParent();
12196  return BB;
12197}
12198
12199MachineBasicBlock *
12200X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const {
12201  DebugLoc dl = MI->getDebugLoc();
12202  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12203
12204  // Address into RAX/EAX, other two args into ECX, EDX.
12205  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
12206  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
12207  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
12208  for (int i = 0; i < X86::AddrNumOperands; ++i)
12209    MIB.addOperand(MI->getOperand(i));
12210
12211  unsigned ValOps = X86::AddrNumOperands;
12212  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
12213    .addReg(MI->getOperand(ValOps).getReg());
12214  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
12215    .addReg(MI->getOperand(ValOps+1).getReg());
12216
12217  // The instruction doesn't actually take any operands though.
12218  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
12219
12220  MI->eraseFromParent(); // The pseudo is gone now.
12221  return BB;
12222}
12223
12224MachineBasicBlock *
12225X86TargetLowering::EmitVAARG64WithCustomInserter(
12226                   MachineInstr *MI,
12227                   MachineBasicBlock *MBB) const {
12228  // Emit va_arg instruction on X86-64.
12229
12230  // Operands to this pseudo-instruction:
12231  // 0  ) Output        : destination address (reg)
12232  // 1-5) Input         : va_list address (addr, i64mem)
12233  // 6  ) ArgSize       : Size (in bytes) of vararg type
12234  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
12235  // 8  ) Align         : Alignment of type
12236  // 9  ) EFLAGS (implicit-def)
12237
12238  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
12239  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
12240
12241  unsigned DestReg = MI->getOperand(0).getReg();
12242  MachineOperand &Base = MI->getOperand(1);
12243  MachineOperand &Scale = MI->getOperand(2);
12244  MachineOperand &Index = MI->getOperand(3);
12245  MachineOperand &Disp = MI->getOperand(4);
12246  MachineOperand &Segment = MI->getOperand(5);
12247  unsigned ArgSize = MI->getOperand(6).getImm();
12248  unsigned ArgMode = MI->getOperand(7).getImm();
12249  unsigned Align = MI->getOperand(8).getImm();
12250
12251  // Memory Reference
12252  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
12253  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
12254  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
12255
12256  // Machine Information
12257  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12258  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
12259  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
12260  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
12261  DebugLoc DL = MI->getDebugLoc();
12262
12263  // struct va_list {
12264  //   i32   gp_offset
12265  //   i32   fp_offset
12266  //   i64   overflow_area (address)
12267  //   i64   reg_save_area (address)
12268  // }
12269  // sizeof(va_list) = 24
12270  // alignment(va_list) = 8
12271
12272  unsigned TotalNumIntRegs = 6;
12273  unsigned TotalNumXMMRegs = 8;
12274  bool UseGPOffset = (ArgMode == 1);
12275  bool UseFPOffset = (ArgMode == 2);
12276  unsigned MaxOffset = TotalNumIntRegs * 8 +
12277                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
12278
12279  /* Align ArgSize to a multiple of 8 */
12280  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
12281  bool NeedsAlign = (Align > 8);
12282
12283  MachineBasicBlock *thisMBB = MBB;
12284  MachineBasicBlock *overflowMBB;
12285  MachineBasicBlock *offsetMBB;
12286  MachineBasicBlock *endMBB;
12287
12288  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
12289  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
12290  unsigned OffsetReg = 0;
12291
12292  if (!UseGPOffset && !UseFPOffset) {
12293    // If we only pull from the overflow region, we don't create a branch.
12294    // We don't need to alter control flow.
12295    OffsetDestReg = 0; // unused
12296    OverflowDestReg = DestReg;
12297
12298    offsetMBB = NULL;
12299    overflowMBB = thisMBB;
12300    endMBB = thisMBB;
12301  } else {
12302    // First emit code to check if gp_offset (or fp_offset) is below the bound.
12303    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
12304    // If not, pull from overflow_area. (branch to overflowMBB)
12305    //
12306    //       thisMBB
12307    //         |     .
12308    //         |        .
12309    //     offsetMBB   overflowMBB
12310    //         |        .
12311    //         |     .
12312    //        endMBB
12313
12314    // Registers for the PHI in endMBB
12315    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
12316    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
12317
12318    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
12319    MachineFunction *MF = MBB->getParent();
12320    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
12321    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
12322    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
12323
12324    MachineFunction::iterator MBBIter = MBB;
12325    ++MBBIter;
12326
12327    // Insert the new basic blocks
12328    MF->insert(MBBIter, offsetMBB);
12329    MF->insert(MBBIter, overflowMBB);
12330    MF->insert(MBBIter, endMBB);
12331
12332    // Transfer the remainder of MBB and its successor edges to endMBB.
12333    endMBB->splice(endMBB->begin(), thisMBB,
12334                    llvm::next(MachineBasicBlock::iterator(MI)),
12335                    thisMBB->end());
12336    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
12337
12338    // Make offsetMBB and overflowMBB successors of thisMBB
12339    thisMBB->addSuccessor(offsetMBB);
12340    thisMBB->addSuccessor(overflowMBB);
12341
12342    // endMBB is a successor of both offsetMBB and overflowMBB
12343    offsetMBB->addSuccessor(endMBB);
12344    overflowMBB->addSuccessor(endMBB);
12345
12346    // Load the offset value into a register
12347    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
12348    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
12349      .addOperand(Base)
12350      .addOperand(Scale)
12351      .addOperand(Index)
12352      .addDisp(Disp, UseFPOffset ? 4 : 0)
12353      .addOperand(Segment)
12354      .setMemRefs(MMOBegin, MMOEnd);
12355
12356    // Check if there is enough room left to pull this argument.
12357    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
12358      .addReg(OffsetReg)
12359      .addImm(MaxOffset + 8 - ArgSizeA8);
12360
12361    // Branch to "overflowMBB" if offset >= max
12362    // Fall through to "offsetMBB" otherwise
12363    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
12364      .addMBB(overflowMBB);
12365  }
12366
12367  // In offsetMBB, emit code to use the reg_save_area.
12368  if (offsetMBB) {
12369    assert(OffsetReg != 0);
12370
12371    // Read the reg_save_area address.
12372    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
12373    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
12374      .addOperand(Base)
12375      .addOperand(Scale)
12376      .addOperand(Index)
12377      .addDisp(Disp, 16)
12378      .addOperand(Segment)
12379      .setMemRefs(MMOBegin, MMOEnd);
12380
12381    // Zero-extend the offset
12382    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
12383      BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
12384        .addImm(0)
12385        .addReg(OffsetReg)
12386        .addImm(X86::sub_32bit);
12387
12388    // Add the offset to the reg_save_area to get the final address.
12389    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
12390      .addReg(OffsetReg64)
12391      .addReg(RegSaveReg);
12392
12393    // Compute the offset for the next argument
12394    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
12395    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
12396      .addReg(OffsetReg)
12397      .addImm(UseFPOffset ? 16 : 8);
12398
12399    // Store it back into the va_list.
12400    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
12401      .addOperand(Base)
12402      .addOperand(Scale)
12403      .addOperand(Index)
12404      .addDisp(Disp, UseFPOffset ? 4 : 0)
12405      .addOperand(Segment)
12406      .addReg(NextOffsetReg)
12407      .setMemRefs(MMOBegin, MMOEnd);
12408
12409    // Jump to endMBB
12410    BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
12411      .addMBB(endMBB);
12412  }
12413
12414  //
12415  // Emit code to use overflow area
12416  //
12417
12418  // Load the overflow_area address into a register.
12419  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
12420  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
12421    .addOperand(Base)
12422    .addOperand(Scale)
12423    .addOperand(Index)
12424    .addDisp(Disp, 8)
12425    .addOperand(Segment)
12426    .setMemRefs(MMOBegin, MMOEnd);
12427
12428  // If we need to align it, do so. Otherwise, just copy the address
12429  // to OverflowDestReg.
12430  if (NeedsAlign) {
12431    // Align the overflow address
12432    assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
12433    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
12434
12435    // aligned_addr = (addr + (align-1)) & ~(align-1)
12436    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
12437      .addReg(OverflowAddrReg)
12438      .addImm(Align-1);
12439
12440    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
12441      .addReg(TmpReg)
12442      .addImm(~(uint64_t)(Align-1));
12443  } else {
12444    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
12445      .addReg(OverflowAddrReg);
12446  }
12447
12448  // Compute the next overflow address after this argument.
12449  // (the overflow address should be kept 8-byte aligned)
12450  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
12451  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
12452    .addReg(OverflowDestReg)
12453    .addImm(ArgSizeA8);
12454
12455  // Store the new overflow address.
12456  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
12457    .addOperand(Base)
12458    .addOperand(Scale)
12459    .addOperand(Index)
12460    .addDisp(Disp, 8)
12461    .addOperand(Segment)
12462    .addReg(NextAddrReg)
12463    .setMemRefs(MMOBegin, MMOEnd);
12464
12465  // If we branched, emit the PHI to the front of endMBB.
12466  if (offsetMBB) {
12467    BuildMI(*endMBB, endMBB->begin(), DL,
12468            TII->get(X86::PHI), DestReg)
12469      .addReg(OffsetDestReg).addMBB(offsetMBB)
12470      .addReg(OverflowDestReg).addMBB(overflowMBB);
12471  }
12472
12473  // Erase the pseudo instruction
12474  MI->eraseFromParent();
12475
12476  return endMBB;
12477}
12478
12479MachineBasicBlock *
12480X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
12481                                                 MachineInstr *MI,
12482                                                 MachineBasicBlock *MBB) const {
12483  // Emit code to save XMM registers to the stack. The ABI says that the
12484  // number of registers to save is given in %al, so it's theoretically
12485  // possible to do an indirect jump trick to avoid saving all of them,
12486  // however this code takes a simpler approach and just executes all
12487  // of the stores if %al is non-zero. It's less code, and it's probably
12488  // easier on the hardware branch predictor, and stores aren't all that
12489  // expensive anyway.
12490
12491  // Create the new basic blocks. One block contains all the XMM stores,
12492  // and one block is the final destination regardless of whether any
12493  // stores were performed.
12494  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
12495  MachineFunction *F = MBB->getParent();
12496  MachineFunction::iterator MBBIter = MBB;
12497  ++MBBIter;
12498  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
12499  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
12500  F->insert(MBBIter, XMMSaveMBB);
12501  F->insert(MBBIter, EndMBB);
12502
12503  // Transfer the remainder of MBB and its successor edges to EndMBB.
12504  EndMBB->splice(EndMBB->begin(), MBB,
12505                 llvm::next(MachineBasicBlock::iterator(MI)),
12506                 MBB->end());
12507  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
12508
12509  // The original block will now fall through to the XMM save block.
12510  MBB->addSuccessor(XMMSaveMBB);
12511  // The XMMSaveMBB will fall through to the end block.
12512  XMMSaveMBB->addSuccessor(EndMBB);
12513
12514  // Now add the instructions.
12515  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12516  DebugLoc DL = MI->getDebugLoc();
12517
12518  unsigned CountReg = MI->getOperand(0).getReg();
12519  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
12520  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
12521
12522  if (!Subtarget->isTargetWin64()) {
12523    // If %al is 0, branch around the XMM save block.
12524    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
12525    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
12526    MBB->addSuccessor(EndMBB);
12527  }
12528
12529  unsigned MOVOpc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
12530  // In the XMM save block, save all the XMM argument registers.
12531  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
12532    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
12533    MachineMemOperand *MMO =
12534      F->getMachineMemOperand(
12535          MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
12536        MachineMemOperand::MOStore,
12537        /*Size=*/16, /*Align=*/16);
12538    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
12539      .addFrameIndex(RegSaveFrameIndex)
12540      .addImm(/*Scale=*/1)
12541      .addReg(/*IndexReg=*/0)
12542      .addImm(/*Disp=*/Offset)
12543      .addReg(/*Segment=*/0)
12544      .addReg(MI->getOperand(i).getReg())
12545      .addMemOperand(MMO);
12546  }
12547
12548  MI->eraseFromParent();   // The pseudo instruction is gone now.
12549
12550  return EndMBB;
12551}
12552
12553// The EFLAGS operand of SelectItr might be missing a kill marker
12554// because there were multiple uses of EFLAGS, and ISel didn't know
12555// which to mark. Figure out whether SelectItr should have had a
12556// kill marker, and set it if it should. Returns the correct kill
12557// marker value.
12558static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
12559                                     MachineBasicBlock* BB,
12560                                     const TargetRegisterInfo* TRI) {
12561  // Scan forward through BB for a use/def of EFLAGS.
12562  MachineBasicBlock::iterator miI(llvm::next(SelectItr));
12563  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
12564    const MachineInstr& mi = *miI;
12565    if (mi.readsRegister(X86::EFLAGS))
12566      return false;
12567    if (mi.definesRegister(X86::EFLAGS))
12568      break; // Should have kill-flag - update below.
12569  }
12570
12571  // If we hit the end of the block, check whether EFLAGS is live into a
12572  // successor.
12573  if (miI == BB->end()) {
12574    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
12575                                          sEnd = BB->succ_end();
12576         sItr != sEnd; ++sItr) {
12577      MachineBasicBlock* succ = *sItr;
12578      if (succ->isLiveIn(X86::EFLAGS))
12579        return false;
12580    }
12581  }
12582
12583  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
12584  // out. SelectMI should have a kill flag on EFLAGS.
12585  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
12586  return true;
12587}
12588
12589MachineBasicBlock *
12590X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
12591                                     MachineBasicBlock *BB) const {
12592  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12593  DebugLoc DL = MI->getDebugLoc();
12594
12595  // To "insert" a SELECT_CC instruction, we actually have to insert the
12596  // diamond control-flow pattern.  The incoming instruction knows the
12597  // destination vreg to set, the condition code register to branch on, the
12598  // true/false values to select between, and a branch opcode to use.
12599  const BasicBlock *LLVM_BB = BB->getBasicBlock();
12600  MachineFunction::iterator It = BB;
12601  ++It;
12602
12603  //  thisMBB:
12604  //  ...
12605  //   TrueVal = ...
12606  //   cmpTY ccX, r1, r2
12607  //   bCC copy1MBB
12608  //   fallthrough --> copy0MBB
12609  MachineBasicBlock *thisMBB = BB;
12610  MachineFunction *F = BB->getParent();
12611  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
12612  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
12613  F->insert(It, copy0MBB);
12614  F->insert(It, sinkMBB);
12615
12616  // If the EFLAGS register isn't dead in the terminator, then claim that it's
12617  // live into the sink and copy blocks.
12618  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
12619  if (!MI->killsRegister(X86::EFLAGS) &&
12620      !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
12621    copy0MBB->addLiveIn(X86::EFLAGS);
12622    sinkMBB->addLiveIn(X86::EFLAGS);
12623  }
12624
12625  // Transfer the remainder of BB and its successor edges to sinkMBB.
12626  sinkMBB->splice(sinkMBB->begin(), BB,
12627                  llvm::next(MachineBasicBlock::iterator(MI)),
12628                  BB->end());
12629  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
12630
12631  // Add the true and fallthrough blocks as its successors.
12632  BB->addSuccessor(copy0MBB);
12633  BB->addSuccessor(sinkMBB);
12634
12635  // Create the conditional branch instruction.
12636  unsigned Opc =
12637    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
12638  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
12639
12640  //  copy0MBB:
12641  //   %FalseValue = ...
12642  //   # fallthrough to sinkMBB
12643  copy0MBB->addSuccessor(sinkMBB);
12644
12645  //  sinkMBB:
12646  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12647  //  ...
12648  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
12649          TII->get(X86::PHI), MI->getOperand(0).getReg())
12650    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
12651    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
12652
12653  MI->eraseFromParent();   // The pseudo instruction is gone now.
12654  return sinkMBB;
12655}
12656
12657MachineBasicBlock *
12658X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
12659                                        bool Is64Bit) const {
12660  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12661  DebugLoc DL = MI->getDebugLoc();
12662  MachineFunction *MF = BB->getParent();
12663  const BasicBlock *LLVM_BB = BB->getBasicBlock();
12664
12665  assert(getTargetMachine().Options.EnableSegmentedStacks);
12666
12667  unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
12668  unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
12669
12670  // BB:
12671  //  ... [Till the alloca]
12672  // If stacklet is not large enough, jump to mallocMBB
12673  //
12674  // bumpMBB:
12675  //  Allocate by subtracting from RSP
12676  //  Jump to continueMBB
12677  //
12678  // mallocMBB:
12679  //  Allocate by call to runtime
12680  //
12681  // continueMBB:
12682  //  ...
12683  //  [rest of original BB]
12684  //
12685
12686  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
12687  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
12688  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
12689
12690  MachineRegisterInfo &MRI = MF->getRegInfo();
12691  const TargetRegisterClass *AddrRegClass =
12692    getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
12693
12694  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
12695    bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
12696    tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
12697    SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
12698    sizeVReg = MI->getOperand(1).getReg(),
12699    physSPReg = Is64Bit ? X86::RSP : X86::ESP;
12700
12701  MachineFunction::iterator MBBIter = BB;
12702  ++MBBIter;
12703
12704  MF->insert(MBBIter, bumpMBB);
12705  MF->insert(MBBIter, mallocMBB);
12706  MF->insert(MBBIter, continueMBB);
12707
12708  continueMBB->splice(continueMBB->begin(), BB, llvm::next
12709                      (MachineBasicBlock::iterator(MI)), BB->end());
12710  continueMBB->transferSuccessorsAndUpdatePHIs(BB);
12711
12712  // Add code to the main basic block to check if the stack limit has been hit,
12713  // and if so, jump to mallocMBB otherwise to bumpMBB.
12714  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
12715  BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
12716    .addReg(tmpSPVReg).addReg(sizeVReg);
12717  BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
12718    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
12719    .addReg(SPLimitVReg);
12720  BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
12721
12722  // bumpMBB simply decreases the stack pointer, since we know the current
12723  // stacklet has enough space.
12724  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
12725    .addReg(SPLimitVReg);
12726  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
12727    .addReg(SPLimitVReg);
12728  BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
12729
12730  // Calls into a routine in libgcc to allocate more space from the heap.
12731  const uint32_t *RegMask =
12732    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
12733  if (Is64Bit) {
12734    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
12735      .addReg(sizeVReg);
12736    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
12737      .addExternalSymbol("__morestack_allocate_stack_space")
12738      .addRegMask(RegMask)
12739      .addReg(X86::RDI, RegState::Implicit)
12740      .addReg(X86::RAX, RegState::ImplicitDefine);
12741  } else {
12742    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
12743      .addImm(12);
12744    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
12745    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
12746      .addExternalSymbol("__morestack_allocate_stack_space")
12747      .addRegMask(RegMask)
12748      .addReg(X86::EAX, RegState::ImplicitDefine);
12749  }
12750
12751  if (!Is64Bit)
12752    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
12753      .addImm(16);
12754
12755  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
12756    .addReg(Is64Bit ? X86::RAX : X86::EAX);
12757  BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
12758
12759  // Set up the CFG correctly.
12760  BB->addSuccessor(bumpMBB);
12761  BB->addSuccessor(mallocMBB);
12762  mallocMBB->addSuccessor(continueMBB);
12763  bumpMBB->addSuccessor(continueMBB);
12764
12765  // Take care of the PHI nodes.
12766  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
12767          MI->getOperand(0).getReg())
12768    .addReg(mallocPtrVReg).addMBB(mallocMBB)
12769    .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
12770
12771  // Delete the original pseudo instruction.
12772  MI->eraseFromParent();
12773
12774  // And we're done.
12775  return continueMBB;
12776}
12777
12778MachineBasicBlock *
12779X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
12780                                          MachineBasicBlock *BB) const {
12781  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12782  DebugLoc DL = MI->getDebugLoc();
12783
12784  assert(!Subtarget->isTargetEnvMacho());
12785
12786  // The lowering is pretty easy: we're just emitting the call to _alloca.  The
12787  // non-trivial part is impdef of ESP.
12788
12789  if (Subtarget->isTargetWin64()) {
12790    if (Subtarget->isTargetCygMing()) {
12791      // ___chkstk(Mingw64):
12792      // Clobbers R10, R11, RAX and EFLAGS.
12793      // Updates RSP.
12794      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
12795        .addExternalSymbol("___chkstk")
12796        .addReg(X86::RAX, RegState::Implicit)
12797        .addReg(X86::RSP, RegState::Implicit)
12798        .addReg(X86::RAX, RegState::Define | RegState::Implicit)
12799        .addReg(X86::RSP, RegState::Define | RegState::Implicit)
12800        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
12801    } else {
12802      // __chkstk(MSVCRT): does not update stack pointer.
12803      // Clobbers R10, R11 and EFLAGS.
12804      // FIXME: RAX(allocated size) might be reused and not killed.
12805      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
12806        .addExternalSymbol("__chkstk")
12807        .addReg(X86::RAX, RegState::Implicit)
12808        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
12809      // RAX has the offset to subtracted from RSP.
12810      BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
12811        .addReg(X86::RSP)
12812        .addReg(X86::RAX);
12813    }
12814  } else {
12815    const char *StackProbeSymbol =
12816      Subtarget->isTargetWindows() ? "_chkstk" : "_alloca";
12817
12818    BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
12819      .addExternalSymbol(StackProbeSymbol)
12820      .addReg(X86::EAX, RegState::Implicit)
12821      .addReg(X86::ESP, RegState::Implicit)
12822      .addReg(X86::EAX, RegState::Define | RegState::Implicit)
12823      .addReg(X86::ESP, RegState::Define | RegState::Implicit)
12824      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
12825  }
12826
12827  MI->eraseFromParent();   // The pseudo instruction is gone now.
12828  return BB;
12829}
12830
12831MachineBasicBlock *
12832X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
12833                                      MachineBasicBlock *BB) const {
12834  // This is pretty easy.  We're taking the value that we received from
12835  // our load from the relocation, sticking it in either RDI (x86-64)
12836  // or EAX and doing an indirect call.  The return value will then
12837  // be in the normal return register.
12838  const X86InstrInfo *TII
12839    = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
12840  DebugLoc DL = MI->getDebugLoc();
12841  MachineFunction *F = BB->getParent();
12842
12843  assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
12844  assert(MI->getOperand(3).isGlobal() && "This should be a global");
12845
12846  // Get a register mask for the lowered call.
12847  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
12848  // proper register mask.
12849  const uint32_t *RegMask =
12850    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
12851  if (Subtarget->is64Bit()) {
12852    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
12853                                      TII->get(X86::MOV64rm), X86::RDI)
12854    .addReg(X86::RIP)
12855    .addImm(0).addReg(0)
12856    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
12857                      MI->getOperand(3).getTargetFlags())
12858    .addReg(0);
12859    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
12860    addDirectMem(MIB, X86::RDI);
12861    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
12862  } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
12863    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
12864                                      TII->get(X86::MOV32rm), X86::EAX)
12865    .addReg(0)
12866    .addImm(0).addReg(0)
12867    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
12868                      MI->getOperand(3).getTargetFlags())
12869    .addReg(0);
12870    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
12871    addDirectMem(MIB, X86::EAX);
12872    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
12873  } else {
12874    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
12875                                      TII->get(X86::MOV32rm), X86::EAX)
12876    .addReg(TII->getGlobalBaseReg(F))
12877    .addImm(0).addReg(0)
12878    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
12879                      MI->getOperand(3).getTargetFlags())
12880    .addReg(0);
12881    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
12882    addDirectMem(MIB, X86::EAX);
12883    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
12884  }
12885
12886  MI->eraseFromParent(); // The pseudo instruction is gone now.
12887  return BB;
12888}
12889
12890MachineBasicBlock *
12891X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
12892                                               MachineBasicBlock *BB) const {
12893  switch (MI->getOpcode()) {
12894  default: llvm_unreachable("Unexpected instr type to insert");
12895  case X86::TAILJMPd64:
12896  case X86::TAILJMPr64:
12897  case X86::TAILJMPm64:
12898    llvm_unreachable("TAILJMP64 would not be touched here.");
12899  case X86::TCRETURNdi64:
12900  case X86::TCRETURNri64:
12901  case X86::TCRETURNmi64:
12902    return BB;
12903  case X86::WIN_ALLOCA:
12904    return EmitLoweredWinAlloca(MI, BB);
12905  case X86::SEG_ALLOCA_32:
12906    return EmitLoweredSegAlloca(MI, BB, false);
12907  case X86::SEG_ALLOCA_64:
12908    return EmitLoweredSegAlloca(MI, BB, true);
12909  case X86::TLSCall_32:
12910  case X86::TLSCall_64:
12911    return EmitLoweredTLSCall(MI, BB);
12912  case X86::CMOV_GR8:
12913  case X86::CMOV_FR32:
12914  case X86::CMOV_FR64:
12915  case X86::CMOV_V4F32:
12916  case X86::CMOV_V2F64:
12917  case X86::CMOV_V2I64:
12918  case X86::CMOV_V8F32:
12919  case X86::CMOV_V4F64:
12920  case X86::CMOV_V4I64:
12921  case X86::CMOV_GR16:
12922  case X86::CMOV_GR32:
12923  case X86::CMOV_RFP32:
12924  case X86::CMOV_RFP64:
12925  case X86::CMOV_RFP80:
12926    return EmitLoweredSelect(MI, BB);
12927
12928  case X86::FP32_TO_INT16_IN_MEM:
12929  case X86::FP32_TO_INT32_IN_MEM:
12930  case X86::FP32_TO_INT64_IN_MEM:
12931  case X86::FP64_TO_INT16_IN_MEM:
12932  case X86::FP64_TO_INT32_IN_MEM:
12933  case X86::FP64_TO_INT64_IN_MEM:
12934  case X86::FP80_TO_INT16_IN_MEM:
12935  case X86::FP80_TO_INT32_IN_MEM:
12936  case X86::FP80_TO_INT64_IN_MEM: {
12937    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12938    DebugLoc DL = MI->getDebugLoc();
12939
12940    // Change the floating point control register to use "round towards zero"
12941    // mode when truncating to an integer value.
12942    MachineFunction *F = BB->getParent();
12943    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
12944    addFrameReference(BuildMI(*BB, MI, DL,
12945                              TII->get(X86::FNSTCW16m)), CWFrameIdx);
12946
12947    // Load the old value of the high byte of the control word...
12948    unsigned OldCW =
12949      F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
12950    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
12951                      CWFrameIdx);
12952
12953    // Set the high part to be round to zero...
12954    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
12955      .addImm(0xC7F);
12956
12957    // Reload the modified control word now...
12958    addFrameReference(BuildMI(*BB, MI, DL,
12959                              TII->get(X86::FLDCW16m)), CWFrameIdx);
12960
12961    // Restore the memory image of control word to original value
12962    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
12963      .addReg(OldCW);
12964
12965    // Get the X86 opcode to use.
12966    unsigned Opc;
12967    switch (MI->getOpcode()) {
12968    default: llvm_unreachable("illegal opcode!");
12969    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
12970    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
12971    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
12972    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
12973    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
12974    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
12975    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
12976    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
12977    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
12978    }
12979
12980    X86AddressMode AM;
12981    MachineOperand &Op = MI->getOperand(0);
12982    if (Op.isReg()) {
12983      AM.BaseType = X86AddressMode::RegBase;
12984      AM.Base.Reg = Op.getReg();
12985    } else {
12986      AM.BaseType = X86AddressMode::FrameIndexBase;
12987      AM.Base.FrameIndex = Op.getIndex();
12988    }
12989    Op = MI->getOperand(1);
12990    if (Op.isImm())
12991      AM.Scale = Op.getImm();
12992    Op = MI->getOperand(2);
12993    if (Op.isImm())
12994      AM.IndexReg = Op.getImm();
12995    Op = MI->getOperand(3);
12996    if (Op.isGlobal()) {
12997      AM.GV = Op.getGlobal();
12998    } else {
12999      AM.Disp = Op.getImm();
13000    }
13001    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
13002                      .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
13003
13004    // Reload the original control word now.
13005    addFrameReference(BuildMI(*BB, MI, DL,
13006                              TII->get(X86::FLDCW16m)), CWFrameIdx);
13007
13008    MI->eraseFromParent();   // The pseudo instruction is gone now.
13009    return BB;
13010  }
13011    // String/text processing lowering.
13012  case X86::PCMPISTRM128REG:
13013  case X86::VPCMPISTRM128REG:
13014  case X86::PCMPISTRM128MEM:
13015  case X86::VPCMPISTRM128MEM:
13016  case X86::PCMPESTRM128REG:
13017  case X86::VPCMPESTRM128REG:
13018  case X86::PCMPESTRM128MEM:
13019  case X86::VPCMPESTRM128MEM: {
13020    unsigned NumArgs;
13021    bool MemArg;
13022    switch (MI->getOpcode()) {
13023    default: llvm_unreachable("illegal opcode!");
13024    case X86::PCMPISTRM128REG:
13025    case X86::VPCMPISTRM128REG:
13026      NumArgs = 3; MemArg = false; break;
13027    case X86::PCMPISTRM128MEM:
13028    case X86::VPCMPISTRM128MEM:
13029      NumArgs = 3; MemArg = true; break;
13030    case X86::PCMPESTRM128REG:
13031    case X86::VPCMPESTRM128REG:
13032      NumArgs = 5; MemArg = false; break;
13033    case X86::PCMPESTRM128MEM:
13034    case X86::VPCMPESTRM128MEM:
13035      NumArgs = 5; MemArg = true; break;
13036    }
13037    return EmitPCMP(MI, BB, NumArgs, MemArg);
13038  }
13039
13040    // Thread synchronization.
13041  case X86::MONITOR:
13042    return EmitMonitor(MI, BB);
13043
13044    // Atomic Lowering.
13045  case X86::ATOMMIN32:
13046  case X86::ATOMMAX32:
13047  case X86::ATOMUMIN32:
13048  case X86::ATOMUMAX32:
13049  case X86::ATOMMIN16:
13050  case X86::ATOMMAX16:
13051  case X86::ATOMUMIN16:
13052  case X86::ATOMUMAX16:
13053  case X86::ATOMMIN64:
13054  case X86::ATOMMAX64:
13055  case X86::ATOMUMIN64:
13056  case X86::ATOMUMAX64: {
13057    unsigned Opc;
13058    switch (MI->getOpcode()) {
13059    default: llvm_unreachable("illegal opcode!");
13060    case X86::ATOMMIN32:  Opc = X86::CMOVL32rr; break;
13061    case X86::ATOMMAX32:  Opc = X86::CMOVG32rr; break;
13062    case X86::ATOMUMIN32: Opc = X86::CMOVB32rr; break;
13063    case X86::ATOMUMAX32: Opc = X86::CMOVA32rr; break;
13064    case X86::ATOMMIN16:  Opc = X86::CMOVL16rr; break;
13065    case X86::ATOMMAX16:  Opc = X86::CMOVG16rr; break;
13066    case X86::ATOMUMIN16: Opc = X86::CMOVB16rr; break;
13067    case X86::ATOMUMAX16: Opc = X86::CMOVA16rr; break;
13068    case X86::ATOMMIN64:  Opc = X86::CMOVL64rr; break;
13069    case X86::ATOMMAX64:  Opc = X86::CMOVG64rr; break;
13070    case X86::ATOMUMIN64: Opc = X86::CMOVB64rr; break;
13071    case X86::ATOMUMAX64: Opc = X86::CMOVA64rr; break;
13072    // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
13073    }
13074    return EmitAtomicMinMaxWithCustomInserter(MI, BB, Opc);
13075  }
13076
13077  case X86::ATOMAND32:
13078  case X86::ATOMOR32:
13079  case X86::ATOMXOR32:
13080  case X86::ATOMNAND32: {
13081    bool Invert = false;
13082    unsigned RegOpc, ImmOpc;
13083    switch (MI->getOpcode()) {
13084    default: llvm_unreachable("illegal opcode!");
13085    case X86::ATOMAND32:
13086      RegOpc = X86::AND32rr; ImmOpc = X86::AND32ri; break;
13087    case X86::ATOMOR32:
13088      RegOpc = X86::OR32rr;  ImmOpc = X86::OR32ri; break;
13089    case X86::ATOMXOR32:
13090      RegOpc = X86::XOR32rr; ImmOpc = X86::XOR32ri; break;
13091    case X86::ATOMNAND32:
13092      RegOpc = X86::AND32rr; ImmOpc = X86::AND32ri; Invert = true; break;
13093    }
13094    return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
13095                                               X86::MOV32rm, X86::LCMPXCHG32,
13096                                               X86::NOT32r, X86::EAX,
13097                                               &X86::GR32RegClass, Invert);
13098  }
13099
13100  case X86::ATOMAND16:
13101  case X86::ATOMOR16:
13102  case X86::ATOMXOR16:
13103  case X86::ATOMNAND16: {
13104    bool Invert = false;
13105    unsigned RegOpc, ImmOpc;
13106    switch (MI->getOpcode()) {
13107    default: llvm_unreachable("illegal opcode!");
13108    case X86::ATOMAND16:
13109      RegOpc = X86::AND16rr; ImmOpc = X86::AND16ri; break;
13110    case X86::ATOMOR16:
13111      RegOpc = X86::OR16rr;  ImmOpc = X86::OR16ri; break;
13112    case X86::ATOMXOR16:
13113      RegOpc = X86::XOR16rr; ImmOpc = X86::XOR16ri; break;
13114    case X86::ATOMNAND16:
13115      RegOpc = X86::AND16rr; ImmOpc = X86::AND16ri; Invert = true; break;
13116    }
13117    return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
13118                                               X86::MOV16rm, X86::LCMPXCHG16,
13119                                               X86::NOT16r, X86::AX,
13120                                               &X86::GR16RegClass, Invert);
13121  }
13122
13123  case X86::ATOMAND8:
13124  case X86::ATOMOR8:
13125  case X86::ATOMXOR8:
13126  case X86::ATOMNAND8: {
13127    bool Invert = false;
13128    unsigned RegOpc, ImmOpc;
13129    switch (MI->getOpcode()) {
13130    default: llvm_unreachable("illegal opcode!");
13131    case X86::ATOMAND8:
13132      RegOpc = X86::AND8rr; ImmOpc = X86::AND8ri; break;
13133    case X86::ATOMOR8:
13134      RegOpc = X86::OR8rr;  ImmOpc = X86::OR8ri; break;
13135    case X86::ATOMXOR8:
13136      RegOpc = X86::XOR8rr; ImmOpc = X86::XOR8ri; break;
13137    case X86::ATOMNAND8:
13138      RegOpc = X86::AND8rr; ImmOpc = X86::AND8ri; Invert = true; break;
13139    }
13140    return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
13141                                               X86::MOV8rm, X86::LCMPXCHG8,
13142                                               X86::NOT8r, X86::AL,
13143                                               &X86::GR8RegClass, Invert);
13144  }
13145
13146  // This group is for 64-bit host.
13147  case X86::ATOMAND64:
13148  case X86::ATOMOR64:
13149  case X86::ATOMXOR64:
13150  case X86::ATOMNAND64: {
13151    bool Invert = false;
13152    unsigned RegOpc, ImmOpc;
13153    switch (MI->getOpcode()) {
13154    default: llvm_unreachable("illegal opcode!");
13155    case X86::ATOMAND64:
13156      RegOpc = X86::AND64rr; ImmOpc = X86::AND64ri32; break;
13157    case X86::ATOMOR64:
13158      RegOpc = X86::OR64rr;  ImmOpc = X86::OR64ri32; break;
13159    case X86::ATOMXOR64:
13160      RegOpc = X86::XOR64rr; ImmOpc = X86::XOR64ri32; break;
13161    case X86::ATOMNAND64:
13162      RegOpc = X86::AND64rr; ImmOpc = X86::AND64ri32; Invert = true; break;
13163    }
13164    return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
13165                                               X86::MOV64rm, X86::LCMPXCHG64,
13166                                               X86::NOT64r, X86::RAX,
13167                                               &X86::GR64RegClass, Invert);
13168  }
13169
13170  // This group does 64-bit operations on a 32-bit host.
13171  case X86::ATOMAND6432:
13172  case X86::ATOMOR6432:
13173  case X86::ATOMXOR6432:
13174  case X86::ATOMNAND6432:
13175  case X86::ATOMADD6432:
13176  case X86::ATOMSUB6432:
13177  case X86::ATOMSWAP6432: {
13178    bool Invert = false;
13179    unsigned RegOpcL, RegOpcH, ImmOpcL, ImmOpcH;
13180    switch (MI->getOpcode()) {
13181    default: llvm_unreachable("illegal opcode!");
13182    case X86::ATOMAND6432:
13183      RegOpcL = RegOpcH = X86::AND32rr;
13184      ImmOpcL = ImmOpcH = X86::AND32ri;
13185      break;
13186    case X86::ATOMOR6432:
13187      RegOpcL = RegOpcH = X86::OR32rr;
13188      ImmOpcL = ImmOpcH = X86::OR32ri;
13189      break;
13190    case X86::ATOMXOR6432:
13191      RegOpcL = RegOpcH = X86::XOR32rr;
13192      ImmOpcL = ImmOpcH = X86::XOR32ri;
13193      break;
13194    case X86::ATOMNAND6432:
13195      RegOpcL = RegOpcH = X86::AND32rr;
13196      ImmOpcL = ImmOpcH = X86::AND32ri;
13197      Invert = true;
13198      break;
13199    case X86::ATOMADD6432:
13200      RegOpcL = X86::ADD32rr; RegOpcH = X86::ADC32rr;
13201      ImmOpcL = X86::ADD32ri; ImmOpcH = X86::ADC32ri;
13202      break;
13203    case X86::ATOMSUB6432:
13204      RegOpcL = X86::SUB32rr; RegOpcH = X86::SBB32rr;
13205      ImmOpcL = X86::SUB32ri; ImmOpcH = X86::SBB32ri;
13206      break;
13207    case X86::ATOMSWAP6432:
13208      RegOpcL = RegOpcH = X86::MOV32rr;
13209      ImmOpcL = ImmOpcH = X86::MOV32ri;
13210      break;
13211    }
13212    return EmitAtomicBit6432WithCustomInserter(MI, BB, RegOpcL, RegOpcH,
13213                                               ImmOpcL, ImmOpcH, Invert);
13214  }
13215
13216  case X86::VASTART_SAVE_XMM_REGS:
13217    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
13218
13219  case X86::VAARG_64:
13220    return EmitVAARG64WithCustomInserter(MI, BB);
13221  }
13222}
13223
13224//===----------------------------------------------------------------------===//
13225//                           X86 Optimization Hooks
13226//===----------------------------------------------------------------------===//
13227
13228void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
13229                                                       APInt &KnownZero,
13230                                                       APInt &KnownOne,
13231                                                       const SelectionDAG &DAG,
13232                                                       unsigned Depth) const {
13233  unsigned BitWidth = KnownZero.getBitWidth();
13234  unsigned Opc = Op.getOpcode();
13235  assert((Opc >= ISD::BUILTIN_OP_END ||
13236          Opc == ISD::INTRINSIC_WO_CHAIN ||
13237          Opc == ISD::INTRINSIC_W_CHAIN ||
13238          Opc == ISD::INTRINSIC_VOID) &&
13239         "Should use MaskedValueIsZero if you don't know whether Op"
13240         " is a target node!");
13241
13242  KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
13243  switch (Opc) {
13244  default: break;
13245  case X86ISD::ADD:
13246  case X86ISD::SUB:
13247  case X86ISD::ADC:
13248  case X86ISD::SBB:
13249  case X86ISD::SMUL:
13250  case X86ISD::UMUL:
13251  case X86ISD::INC:
13252  case X86ISD::DEC:
13253  case X86ISD::OR:
13254  case X86ISD::XOR:
13255  case X86ISD::AND:
13256    // These nodes' second result is a boolean.
13257    if (Op.getResNo() == 0)
13258      break;
13259    // Fallthrough
13260  case X86ISD::SETCC:
13261    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
13262    break;
13263  case ISD::INTRINSIC_WO_CHAIN: {
13264    unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
13265    unsigned NumLoBits = 0;
13266    switch (IntId) {
13267    default: break;
13268    case Intrinsic::x86_sse_movmsk_ps:
13269    case Intrinsic::x86_avx_movmsk_ps_256:
13270    case Intrinsic::x86_sse2_movmsk_pd:
13271    case Intrinsic::x86_avx_movmsk_pd_256:
13272    case Intrinsic::x86_mmx_pmovmskb:
13273    case Intrinsic::x86_sse2_pmovmskb_128:
13274    case Intrinsic::x86_avx2_pmovmskb: {
13275      // High bits of movmskp{s|d}, pmovmskb are known zero.
13276      switch (IntId) {
13277        default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
13278        case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
13279        case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
13280        case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
13281        case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
13282        case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
13283        case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
13284        case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
13285      }
13286      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
13287      break;
13288    }
13289    }
13290    break;
13291  }
13292  }
13293}
13294
13295unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
13296                                                         unsigned Depth) const {
13297  // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
13298  if (Op.getOpcode() == X86ISD::SETCC_CARRY)
13299    return Op.getValueType().getScalarType().getSizeInBits();
13300
13301  // Fallback case.
13302  return 1;
13303}
13304
13305/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
13306/// node is a GlobalAddress + offset.
13307bool X86TargetLowering::isGAPlusOffset(SDNode *N,
13308                                       const GlobalValue* &GA,
13309                                       int64_t &Offset) const {
13310  if (N->getOpcode() == X86ISD::Wrapper) {
13311    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
13312      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
13313      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
13314      return true;
13315    }
13316  }
13317  return TargetLowering::isGAPlusOffset(N, GA, Offset);
13318}
13319
13320/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
13321/// same as extracting the high 128-bit part of 256-bit vector and then
13322/// inserting the result into the low part of a new 256-bit vector
13323static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
13324  EVT VT = SVOp->getValueType(0);
13325  unsigned NumElems = VT.getVectorNumElements();
13326
13327  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
13328  for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
13329    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
13330        SVOp->getMaskElt(j) >= 0)
13331      return false;
13332
13333  return true;
13334}
13335
13336/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
13337/// same as extracting the low 128-bit part of 256-bit vector and then
13338/// inserting the result into the high part of a new 256-bit vector
13339static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
13340  EVT VT = SVOp->getValueType(0);
13341  unsigned NumElems = VT.getVectorNumElements();
13342
13343  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
13344  for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
13345    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
13346        SVOp->getMaskElt(j) >= 0)
13347      return false;
13348
13349  return true;
13350}
13351
13352/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
13353static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
13354                                        TargetLowering::DAGCombinerInfo &DCI,
13355                                        const X86Subtarget* Subtarget) {
13356  DebugLoc dl = N->getDebugLoc();
13357  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
13358  SDValue V1 = SVOp->getOperand(0);
13359  SDValue V2 = SVOp->getOperand(1);
13360  EVT VT = SVOp->getValueType(0);
13361  unsigned NumElems = VT.getVectorNumElements();
13362
13363  if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
13364      V2.getOpcode() == ISD::CONCAT_VECTORS) {
13365    //
13366    //                   0,0,0,...
13367    //                      |
13368    //    V      UNDEF    BUILD_VECTOR    UNDEF
13369    //     \      /           \           /
13370    //  CONCAT_VECTOR         CONCAT_VECTOR
13371    //         \                  /
13372    //          \                /
13373    //          RESULT: V + zero extended
13374    //
13375    if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
13376        V2.getOperand(1).getOpcode() != ISD::UNDEF ||
13377        V1.getOperand(1).getOpcode() != ISD::UNDEF)
13378      return SDValue();
13379
13380    if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
13381      return SDValue();
13382
13383    // To match the shuffle mask, the first half of the mask should
13384    // be exactly the first vector, and all the rest a splat with the
13385    // first element of the second one.
13386    for (unsigned i = 0; i != NumElems/2; ++i)
13387      if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
13388          !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
13389        return SDValue();
13390
13391    // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
13392    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
13393      if (Ld->hasNUsesOfValue(1, 0)) {
13394        SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
13395        SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
13396        SDValue ResNode =
13397          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
13398                                  Ld->getMemoryVT(),
13399                                  Ld->getPointerInfo(),
13400                                  Ld->getAlignment(),
13401                                  false/*isVolatile*/, true/*ReadMem*/,
13402                                  false/*WriteMem*/);
13403        return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
13404      }
13405    }
13406
13407    // Emit a zeroed vector and insert the desired subvector on its
13408    // first half.
13409    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
13410    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
13411    return DCI.CombineTo(N, InsV);
13412  }
13413
13414  //===--------------------------------------------------------------------===//
13415  // Combine some shuffles into subvector extracts and inserts:
13416  //
13417
13418  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
13419  if (isShuffleHigh128VectorInsertLow(SVOp)) {
13420    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
13421    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
13422    return DCI.CombineTo(N, InsV);
13423  }
13424
13425  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
13426  if (isShuffleLow128VectorInsertHigh(SVOp)) {
13427    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
13428    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
13429    return DCI.CombineTo(N, InsV);
13430  }
13431
13432  return SDValue();
13433}
13434
13435/// PerformShuffleCombine - Performs several different shuffle combines.
13436static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
13437                                     TargetLowering::DAGCombinerInfo &DCI,
13438                                     const X86Subtarget *Subtarget) {
13439  DebugLoc dl = N->getDebugLoc();
13440  EVT VT = N->getValueType(0);
13441
13442  // Don't create instructions with illegal types after legalize types has run.
13443  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13444  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
13445    return SDValue();
13446
13447  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
13448  if (Subtarget->hasAVX() && VT.is256BitVector() &&
13449      N->getOpcode() == ISD::VECTOR_SHUFFLE)
13450    return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
13451
13452  // Only handle 128 wide vector from here on.
13453  if (!VT.is128BitVector())
13454    return SDValue();
13455
13456  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
13457  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
13458  // consecutive, non-overlapping, and in the right order.
13459  SmallVector<SDValue, 16> Elts;
13460  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
13461    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
13462
13463  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
13464}
13465
13466
13467/// DCI, PerformTruncateCombine - Converts truncate operation to
13468/// a sequence of vector shuffle operations.
13469/// It is possible when we truncate 256-bit vector to 128-bit vector
13470
13471SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
13472                                                  DAGCombinerInfo &DCI) const {
13473  if (!DCI.isBeforeLegalizeOps())
13474    return SDValue();
13475
13476  if (!Subtarget->hasAVX())
13477    return SDValue();
13478
13479  EVT VT = N->getValueType(0);
13480  SDValue Op = N->getOperand(0);
13481  EVT OpVT = Op.getValueType();
13482  DebugLoc dl = N->getDebugLoc();
13483
13484  if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
13485
13486    if (Subtarget->hasAVX2()) {
13487      // AVX2: v4i64 -> v4i32
13488
13489      // VPERMD
13490      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
13491
13492      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op);
13493      Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32),
13494                                ShufMask);
13495
13496      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op,
13497                         DAG.getIntPtrConstant(0));
13498    }
13499
13500    // AVX: v4i64 -> v4i32
13501    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
13502                               DAG.getIntPtrConstant(0));
13503
13504    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
13505                               DAG.getIntPtrConstant(2));
13506
13507    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
13508    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
13509
13510    // PSHUFD
13511    static const int ShufMask1[] = {0, 2, 0, 0};
13512
13513    SDValue Undef = DAG.getUNDEF(VT);
13514    OpLo = DAG.getVectorShuffle(VT, dl, OpLo, Undef, ShufMask1);
13515    OpHi = DAG.getVectorShuffle(VT, dl, OpHi, Undef, ShufMask1);
13516
13517    // MOVLHPS
13518    static const int ShufMask2[] = {0, 1, 4, 5};
13519
13520    return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
13521  }
13522
13523  if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
13524
13525    if (Subtarget->hasAVX2()) {
13526      // AVX2: v8i32 -> v8i16
13527
13528      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op);
13529
13530      // PSHUFB
13531      SmallVector<SDValue,32> pshufbMask;
13532      for (unsigned i = 0; i < 2; ++i) {
13533        pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
13534        pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
13535        pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
13536        pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
13537        pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
13538        pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
13539        pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
13540        pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
13541        for (unsigned j = 0; j < 8; ++j)
13542          pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
13543      }
13544      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8,
13545                               &pshufbMask[0], 32);
13546      Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV);
13547
13548      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op);
13549
13550      static const int ShufMask[] = {0,  2,  -1,  -1};
13551      Op = DAG.getVectorShuffle(MVT::v4i64, dl,  Op, DAG.getUNDEF(MVT::v4i64),
13552                                &ShufMask[0]);
13553
13554      Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
13555                       DAG.getIntPtrConstant(0));
13556
13557      return DAG.getNode(ISD::BITCAST, dl, VT, Op);
13558    }
13559
13560    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
13561                               DAG.getIntPtrConstant(0));
13562
13563    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
13564                               DAG.getIntPtrConstant(4));
13565
13566    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
13567    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
13568
13569    // PSHUFB
13570    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
13571                                   -1, -1, -1, -1, -1, -1, -1, -1};
13572
13573    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
13574    OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, Undef, ShufMask1);
13575    OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, Undef, ShufMask1);
13576
13577    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
13578    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
13579
13580    // MOVLHPS
13581    static const int ShufMask2[] = {0, 1, 4, 5};
13582
13583    SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
13584    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
13585  }
13586
13587  return SDValue();
13588}
13589
13590/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
13591/// specific shuffle of a load can be folded into a single element load.
13592/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
13593/// shuffles have been customed lowered so we need to handle those here.
13594static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
13595                                         TargetLowering::DAGCombinerInfo &DCI) {
13596  if (DCI.isBeforeLegalizeOps())
13597    return SDValue();
13598
13599  SDValue InVec = N->getOperand(0);
13600  SDValue EltNo = N->getOperand(1);
13601
13602  if (!isa<ConstantSDNode>(EltNo))
13603    return SDValue();
13604
13605  EVT VT = InVec.getValueType();
13606
13607  bool HasShuffleIntoBitcast = false;
13608  if (InVec.getOpcode() == ISD::BITCAST) {
13609    // Don't duplicate a load with other uses.
13610    if (!InVec.hasOneUse())
13611      return SDValue();
13612    EVT BCVT = InVec.getOperand(0).getValueType();
13613    if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
13614      return SDValue();
13615    InVec = InVec.getOperand(0);
13616    HasShuffleIntoBitcast = true;
13617  }
13618
13619  if (!isTargetShuffle(InVec.getOpcode()))
13620    return SDValue();
13621
13622  // Don't duplicate a load with other uses.
13623  if (!InVec.hasOneUse())
13624    return SDValue();
13625
13626  SmallVector<int, 16> ShuffleMask;
13627  bool UnaryShuffle;
13628  if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
13629                            UnaryShuffle))
13630    return SDValue();
13631
13632  // Select the input vector, guarding against out of range extract vector.
13633  unsigned NumElems = VT.getVectorNumElements();
13634  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
13635  int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
13636  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
13637                                         : InVec.getOperand(1);
13638
13639  // If inputs to shuffle are the same for both ops, then allow 2 uses
13640  unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
13641
13642  if (LdNode.getOpcode() == ISD::BITCAST) {
13643    // Don't duplicate a load with other uses.
13644    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
13645      return SDValue();
13646
13647    AllowedUses = 1; // only allow 1 load use if we have a bitcast
13648    LdNode = LdNode.getOperand(0);
13649  }
13650
13651  if (!ISD::isNormalLoad(LdNode.getNode()))
13652    return SDValue();
13653
13654  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
13655
13656  if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
13657    return SDValue();
13658
13659  if (HasShuffleIntoBitcast) {
13660    // If there's a bitcast before the shuffle, check if the load type and
13661    // alignment is valid.
13662    unsigned Align = LN0->getAlignment();
13663    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13664    unsigned NewAlign = TLI.getTargetData()->
13665      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
13666
13667    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
13668      return SDValue();
13669  }
13670
13671  // All checks match so transform back to vector_shuffle so that DAG combiner
13672  // can finish the job
13673  DebugLoc dl = N->getDebugLoc();
13674
13675  // Create shuffle node taking into account the case that its a unary shuffle
13676  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
13677  Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
13678                                 InVec.getOperand(0), Shuffle,
13679                                 &ShuffleMask[0]);
13680  Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
13681  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
13682                     EltNo);
13683}
13684
13685/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
13686/// generation and convert it from being a bunch of shuffles and extracts
13687/// to a simple store and scalar loads to extract the elements.
13688static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
13689                                         TargetLowering::DAGCombinerInfo &DCI) {
13690  SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
13691  if (NewOp.getNode())
13692    return NewOp;
13693
13694  SDValue InputVector = N->getOperand(0);
13695
13696  // Only operate on vectors of 4 elements, where the alternative shuffling
13697  // gets to be more expensive.
13698  if (InputVector.getValueType() != MVT::v4i32)
13699    return SDValue();
13700
13701  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
13702  // single use which is a sign-extend or zero-extend, and all elements are
13703  // used.
13704  SmallVector<SDNode *, 4> Uses;
13705  unsigned ExtractedElements = 0;
13706  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
13707       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
13708    if (UI.getUse().getResNo() != InputVector.getResNo())
13709      return SDValue();
13710
13711    SDNode *Extract = *UI;
13712    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13713      return SDValue();
13714
13715    if (Extract->getValueType(0) != MVT::i32)
13716      return SDValue();
13717    if (!Extract->hasOneUse())
13718      return SDValue();
13719    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
13720        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
13721      return SDValue();
13722    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
13723      return SDValue();
13724
13725    // Record which element was extracted.
13726    ExtractedElements |=
13727      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
13728
13729    Uses.push_back(Extract);
13730  }
13731
13732  // If not all the elements were used, this may not be worthwhile.
13733  if (ExtractedElements != 15)
13734    return SDValue();
13735
13736  // Ok, we've now decided to do the transformation.
13737  DebugLoc dl = InputVector.getDebugLoc();
13738
13739  // Store the value to a temporary stack slot.
13740  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
13741  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
13742                            MachinePointerInfo(), false, false, 0);
13743
13744  // Replace each use (extract) with a load of the appropriate element.
13745  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
13746       UE = Uses.end(); UI != UE; ++UI) {
13747    SDNode *Extract = *UI;
13748
13749    // cOMpute the element's address.
13750    SDValue Idx = Extract->getOperand(1);
13751    unsigned EltSize =
13752        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
13753    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
13754    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13755    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
13756
13757    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
13758                                     StackPtr, OffsetVal);
13759
13760    // Load the scalar.
13761    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
13762                                     ScalarAddr, MachinePointerInfo(),
13763                                     false, false, false, 0);
13764
13765    // Replace the exact with the load.
13766    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
13767  }
13768
13769  // The replacement was made in place; don't return anything.
13770  return SDValue();
13771}
13772
13773/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
13774/// nodes.
13775static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
13776                                    TargetLowering::DAGCombinerInfo &DCI,
13777                                    const X86Subtarget *Subtarget) {
13778  DebugLoc DL = N->getDebugLoc();
13779  SDValue Cond = N->getOperand(0);
13780  // Get the LHS/RHS of the select.
13781  SDValue LHS = N->getOperand(1);
13782  SDValue RHS = N->getOperand(2);
13783  EVT VT = LHS.getValueType();
13784
13785  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
13786  // instructions match the semantics of the common C idiom x<y?x:y but not
13787  // x<=y?x:y, because of how they handle negative zero (which can be
13788  // ignored in unsafe-math mode).
13789  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
13790      VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
13791      (Subtarget->hasSSE2() ||
13792       (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
13793    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
13794
13795    unsigned Opcode = 0;
13796    // Check for x CC y ? x : y.
13797    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
13798        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
13799      switch (CC) {
13800      default: break;
13801      case ISD::SETULT:
13802        // Converting this to a min would handle NaNs incorrectly, and swapping
13803        // the operands would cause it to handle comparisons between positive
13804        // and negative zero incorrectly.
13805        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
13806          if (!DAG.getTarget().Options.UnsafeFPMath &&
13807              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
13808            break;
13809          std::swap(LHS, RHS);
13810        }
13811        Opcode = X86ISD::FMIN;
13812        break;
13813      case ISD::SETOLE:
13814        // Converting this to a min would handle comparisons between positive
13815        // and negative zero incorrectly.
13816        if (!DAG.getTarget().Options.UnsafeFPMath &&
13817            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
13818          break;
13819        Opcode = X86ISD::FMIN;
13820        break;
13821      case ISD::SETULE:
13822        // Converting this to a min would handle both negative zeros and NaNs
13823        // incorrectly, but we can swap the operands to fix both.
13824        std::swap(LHS, RHS);
13825      case ISD::SETOLT:
13826      case ISD::SETLT:
13827      case ISD::SETLE:
13828        Opcode = X86ISD::FMIN;
13829        break;
13830
13831      case ISD::SETOGE:
13832        // Converting this to a max would handle comparisons between positive
13833        // and negative zero incorrectly.
13834        if (!DAG.getTarget().Options.UnsafeFPMath &&
13835            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
13836          break;
13837        Opcode = X86ISD::FMAX;
13838        break;
13839      case ISD::SETUGT:
13840        // Converting this to a max would handle NaNs incorrectly, and swapping
13841        // the operands would cause it to handle comparisons between positive
13842        // and negative zero incorrectly.
13843        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
13844          if (!DAG.getTarget().Options.UnsafeFPMath &&
13845              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
13846            break;
13847          std::swap(LHS, RHS);
13848        }
13849        Opcode = X86ISD::FMAX;
13850        break;
13851      case ISD::SETUGE:
13852        // Converting this to a max would handle both negative zeros and NaNs
13853        // incorrectly, but we can swap the operands to fix both.
13854        std::swap(LHS, RHS);
13855      case ISD::SETOGT:
13856      case ISD::SETGT:
13857      case ISD::SETGE:
13858        Opcode = X86ISD::FMAX;
13859        break;
13860      }
13861    // Check for x CC y ? y : x -- a min/max with reversed arms.
13862    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
13863               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
13864      switch (CC) {
13865      default: break;
13866      case ISD::SETOGE:
13867        // Converting this to a min would handle comparisons between positive
13868        // and negative zero incorrectly, and swapping the operands would
13869        // cause it to handle NaNs incorrectly.
13870        if (!DAG.getTarget().Options.UnsafeFPMath &&
13871            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
13872          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
13873            break;
13874          std::swap(LHS, RHS);
13875        }
13876        Opcode = X86ISD::FMIN;
13877        break;
13878      case ISD::SETUGT:
13879        // Converting this to a min would handle NaNs incorrectly.
13880        if (!DAG.getTarget().Options.UnsafeFPMath &&
13881            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
13882          break;
13883        Opcode = X86ISD::FMIN;
13884        break;
13885      case ISD::SETUGE:
13886        // Converting this to a min would handle both negative zeros and NaNs
13887        // incorrectly, but we can swap the operands to fix both.
13888        std::swap(LHS, RHS);
13889      case ISD::SETOGT:
13890      case ISD::SETGT:
13891      case ISD::SETGE:
13892        Opcode = X86ISD::FMIN;
13893        break;
13894
13895      case ISD::SETULT:
13896        // Converting this to a max would handle NaNs incorrectly.
13897        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
13898          break;
13899        Opcode = X86ISD::FMAX;
13900        break;
13901      case ISD::SETOLE:
13902        // Converting this to a max would handle comparisons between positive
13903        // and negative zero incorrectly, and swapping the operands would
13904        // cause it to handle NaNs incorrectly.
13905        if (!DAG.getTarget().Options.UnsafeFPMath &&
13906            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
13907          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
13908            break;
13909          std::swap(LHS, RHS);
13910        }
13911        Opcode = X86ISD::FMAX;
13912        break;
13913      case ISD::SETULE:
13914        // Converting this to a max would handle both negative zeros and NaNs
13915        // incorrectly, but we can swap the operands to fix both.
13916        std::swap(LHS, RHS);
13917      case ISD::SETOLT:
13918      case ISD::SETLT:
13919      case ISD::SETLE:
13920        Opcode = X86ISD::FMAX;
13921        break;
13922      }
13923    }
13924
13925    if (Opcode)
13926      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
13927  }
13928
13929  // If this is a select between two integer constants, try to do some
13930  // optimizations.
13931  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
13932    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
13933      // Don't do this for crazy integer types.
13934      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
13935        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
13936        // so that TrueC (the true value) is larger than FalseC.
13937        bool NeedsCondInvert = false;
13938
13939        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
13940            // Efficiently invertible.
13941            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
13942             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
13943              isa<ConstantSDNode>(Cond.getOperand(1))))) {
13944          NeedsCondInvert = true;
13945          std::swap(TrueC, FalseC);
13946        }
13947
13948        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
13949        if (FalseC->getAPIntValue() == 0 &&
13950            TrueC->getAPIntValue().isPowerOf2()) {
13951          if (NeedsCondInvert) // Invert the condition if needed.
13952            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
13953                               DAG.getConstant(1, Cond.getValueType()));
13954
13955          // Zero extend the condition if needed.
13956          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
13957
13958          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
13959          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
13960                             DAG.getConstant(ShAmt, MVT::i8));
13961        }
13962
13963        // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
13964        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
13965          if (NeedsCondInvert) // Invert the condition if needed.
13966            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
13967                               DAG.getConstant(1, Cond.getValueType()));
13968
13969          // Zero extend the condition if needed.
13970          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
13971                             FalseC->getValueType(0), Cond);
13972          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
13973                             SDValue(FalseC, 0));
13974        }
13975
13976        // Optimize cases that will turn into an LEA instruction.  This requires
13977        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
13978        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
13979          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
13980          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
13981
13982          bool isFastMultiplier = false;
13983          if (Diff < 10) {
13984            switch ((unsigned char)Diff) {
13985              default: break;
13986              case 1:  // result = add base, cond
13987              case 2:  // result = lea base(    , cond*2)
13988              case 3:  // result = lea base(cond, cond*2)
13989              case 4:  // result = lea base(    , cond*4)
13990              case 5:  // result = lea base(cond, cond*4)
13991              case 8:  // result = lea base(    , cond*8)
13992              case 9:  // result = lea base(cond, cond*8)
13993                isFastMultiplier = true;
13994                break;
13995            }
13996          }
13997
13998          if (isFastMultiplier) {
13999            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
14000            if (NeedsCondInvert) // Invert the condition if needed.
14001              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
14002                                 DAG.getConstant(1, Cond.getValueType()));
14003
14004            // Zero extend the condition if needed.
14005            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
14006                               Cond);
14007            // Scale the condition by the difference.
14008            if (Diff != 1)
14009              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
14010                                 DAG.getConstant(Diff, Cond.getValueType()));
14011
14012            // Add the base if non-zero.
14013            if (FalseC->getAPIntValue() != 0)
14014              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
14015                                 SDValue(FalseC, 0));
14016            return Cond;
14017          }
14018        }
14019      }
14020  }
14021
14022  // Canonicalize max and min:
14023  // (x > y) ? x : y -> (x >= y) ? x : y
14024  // (x < y) ? x : y -> (x <= y) ? x : y
14025  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
14026  // the need for an extra compare
14027  // against zero. e.g.
14028  // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
14029  // subl   %esi, %edi
14030  // testl  %edi, %edi
14031  // movl   $0, %eax
14032  // cmovgl %edi, %eax
14033  // =>
14034  // xorl   %eax, %eax
14035  // subl   %esi, $edi
14036  // cmovsl %eax, %edi
14037  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
14038      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
14039      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
14040    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
14041    switch (CC) {
14042    default: break;
14043    case ISD::SETLT:
14044    case ISD::SETGT: {
14045      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
14046      Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(),
14047                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
14048      return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
14049    }
14050    }
14051  }
14052
14053  // If we know that this node is legal then we know that it is going to be
14054  // matched by one of the SSE/AVX BLEND instructions. These instructions only
14055  // depend on the highest bit in each word. Try to use SimplifyDemandedBits
14056  // to simplify previous instructions.
14057  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14058  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
14059      !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
14060    unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
14061
14062    // Don't optimize vector selects that map to mask-registers.
14063    if (BitWidth == 1)
14064      return SDValue();
14065
14066    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
14067    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
14068
14069    APInt KnownZero, KnownOne;
14070    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
14071                                          DCI.isBeforeLegalizeOps());
14072    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
14073        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
14074      DCI.CommitTargetLoweringOpt(TLO);
14075  }
14076
14077  return SDValue();
14078}
14079
14080// Check whether a boolean test is testing a boolean value generated by
14081// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
14082// code.
14083//
14084// Simplify the following patterns:
14085// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
14086// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
14087// to (Op EFLAGS Cond)
14088//
14089// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
14090// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
14091// to (Op EFLAGS !Cond)
14092//
14093// where Op could be BRCOND or CMOV.
14094//
14095static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
14096  // Quit if not CMP and SUB with its value result used.
14097  if (Cmp.getOpcode() != X86ISD::CMP &&
14098      (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
14099      return SDValue();
14100
14101  // Quit if not used as a boolean value.
14102  if (CC != X86::COND_E && CC != X86::COND_NE)
14103    return SDValue();
14104
14105  // Check CMP operands. One of them should be 0 or 1 and the other should be
14106  // an SetCC or extended from it.
14107  SDValue Op1 = Cmp.getOperand(0);
14108  SDValue Op2 = Cmp.getOperand(1);
14109
14110  SDValue SetCC;
14111  const ConstantSDNode* C = 0;
14112  bool needOppositeCond = (CC == X86::COND_E);
14113
14114  if ((C = dyn_cast<ConstantSDNode>(Op1)))
14115    SetCC = Op2;
14116  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
14117    SetCC = Op1;
14118  else // Quit if all operands are not constants.
14119    return SDValue();
14120
14121  if (C->getZExtValue() == 1)
14122    needOppositeCond = !needOppositeCond;
14123  else if (C->getZExtValue() != 0)
14124    // Quit if the constant is neither 0 or 1.
14125    return SDValue();
14126
14127  // Skip 'zext' node.
14128  if (SetCC.getOpcode() == ISD::ZERO_EXTEND)
14129    SetCC = SetCC.getOperand(0);
14130
14131  // Quit if not SETCC.
14132  // FIXME: So far we only handle the boolean value generated from SETCC. If
14133  // there is other ways to generate boolean values, we need handle them here
14134  // as well.
14135  if (SetCC.getOpcode() != X86ISD::SETCC)
14136    return SDValue();
14137
14138  // Set the condition code or opposite one if necessary.
14139  CC = X86::CondCode(SetCC.getConstantOperandVal(0));
14140  if (needOppositeCond)
14141    CC = X86::GetOppositeBranchCondition(CC);
14142
14143  return SetCC.getOperand(1);
14144}
14145
14146/// checkFlaggedOrCombine - DAG combination on X86ISD::OR, i.e. with EFLAGS
14147/// updated. If only flag result is used and the result is evaluated from a
14148/// series of element extraction, try to combine it into a PTEST.
14149static SDValue checkFlaggedOrCombine(SDValue Or, X86::CondCode &CC,
14150                                     SelectionDAG &DAG,
14151                                     const X86Subtarget *Subtarget) {
14152  SDNode *N = Or.getNode();
14153  DebugLoc DL = N->getDebugLoc();
14154
14155  // Only SSE4.1 and beyond supports PTEST or like.
14156  if (!Subtarget->hasSSE41())
14157    return SDValue();
14158
14159  if (N->getOpcode() != X86ISD::OR)
14160    return SDValue();
14161
14162  // Quit if the value result of OR is used.
14163  if (N->hasAnyUseOfValue(0))
14164    return SDValue();
14165
14166  // Quit if not used as a boolean value.
14167  if (CC != X86::COND_E && CC != X86::COND_NE)
14168    return SDValue();
14169
14170  SmallVector<SDValue, 8> Opnds;
14171  SDValue VecIn;
14172  EVT VT = MVT::Other;
14173  unsigned Mask = 0;
14174
14175  // Recognize a special case where a vector is casted into wide integer to
14176  // test all 0s.
14177  Opnds.push_back(N->getOperand(0));
14178  Opnds.push_back(N->getOperand(1));
14179
14180  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14181    SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
14182    // BFS traverse all OR'd operands.
14183    if (I->getOpcode() == ISD::OR) {
14184      Opnds.push_back(I->getOperand(0));
14185      Opnds.push_back(I->getOperand(1));
14186      // Re-evaluate the number of nodes to be traversed.
14187      e += 2; // 2 more nodes (LHS and RHS) are pushed.
14188      continue;
14189    }
14190
14191    // Quit if a non-EXTRACT_VECTOR_ELT
14192    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14193      return SDValue();
14194
14195    // Quit if without a constant index.
14196    SDValue Idx = I->getOperand(1);
14197    if (!isa<ConstantSDNode>(Idx))
14198      return SDValue();
14199
14200    // Check if all elements are extracted from the same vector.
14201    SDValue ExtractedFromVec = I->getOperand(0);
14202    if (VecIn.getNode() == 0) {
14203      VT = ExtractedFromVec.getValueType();
14204      // FIXME: only 128-bit vector is supported so far.
14205      if (!VT.is128BitVector())
14206        return SDValue();
14207      VecIn = ExtractedFromVec;
14208    } else if (VecIn != ExtractedFromVec)
14209      return SDValue();
14210
14211    // Record the constant index.
14212    Mask |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
14213  }
14214
14215  assert(VT.is128BitVector() && "Only 128-bit vector PTEST is supported so far.");
14216
14217  // Quit if not all elements are used.
14218  if (Mask != (1U << VT.getVectorNumElements()) - 1U)
14219    return SDValue();
14220
14221  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIn, VecIn);
14222}
14223
14224/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
14225static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
14226                                  TargetLowering::DAGCombinerInfo &DCI,
14227                                  const X86Subtarget *Subtarget) {
14228  DebugLoc DL = N->getDebugLoc();
14229
14230  // If the flag operand isn't dead, don't touch this CMOV.
14231  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
14232    return SDValue();
14233
14234  SDValue FalseOp = N->getOperand(0);
14235  SDValue TrueOp = N->getOperand(1);
14236  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
14237  SDValue Cond = N->getOperand(3);
14238
14239  if (CC == X86::COND_E || CC == X86::COND_NE) {
14240    switch (Cond.getOpcode()) {
14241    default: break;
14242    case X86ISD::BSR:
14243    case X86ISD::BSF:
14244      // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
14245      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
14246        return (CC == X86::COND_E) ? FalseOp : TrueOp;
14247    }
14248  }
14249
14250  SDValue Flags;
14251
14252  Flags = checkBoolTestSetCCCombine(Cond, CC);
14253  if (Flags.getNode() &&
14254      // Extra check as FCMOV only supports a subset of X86 cond.
14255      (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
14256    SDValue Ops[] = { FalseOp, TrueOp,
14257                      DAG.getConstant(CC, MVT::i8), Flags };
14258    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
14259                       Ops, array_lengthof(Ops));
14260  }
14261
14262  Flags = checkFlaggedOrCombine(Cond, CC, DAG, Subtarget);
14263  if (Flags.getNode()) {
14264    SDValue Ops[] = { FalseOp, TrueOp,
14265                      DAG.getConstant(CC, MVT::i8), Flags };
14266    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
14267                       Ops, array_lengthof(Ops));
14268  }
14269
14270  // If this is a select between two integer constants, try to do some
14271  // optimizations.  Note that the operands are ordered the opposite of SELECT
14272  // operands.
14273  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
14274    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
14275      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
14276      // larger than FalseC (the false value).
14277      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
14278        CC = X86::GetOppositeBranchCondition(CC);
14279        std::swap(TrueC, FalseC);
14280      }
14281
14282      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
14283      // This is efficient for any integer data type (including i8/i16) and
14284      // shift amount.
14285      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
14286        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
14287                           DAG.getConstant(CC, MVT::i8), Cond);
14288
14289        // Zero extend the condition if needed.
14290        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
14291
14292        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
14293        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
14294                           DAG.getConstant(ShAmt, MVT::i8));
14295        if (N->getNumValues() == 2)  // Dead flag value?
14296          return DCI.CombineTo(N, Cond, SDValue());
14297        return Cond;
14298      }
14299
14300      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
14301      // for any integer data type, including i8/i16.
14302      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
14303        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
14304                           DAG.getConstant(CC, MVT::i8), Cond);
14305
14306        // Zero extend the condition if needed.
14307        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
14308                           FalseC->getValueType(0), Cond);
14309        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
14310                           SDValue(FalseC, 0));
14311
14312        if (N->getNumValues() == 2)  // Dead flag value?
14313          return DCI.CombineTo(N, Cond, SDValue());
14314        return Cond;
14315      }
14316
14317      // Optimize cases that will turn into an LEA instruction.  This requires
14318      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
14319      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
14320        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
14321        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
14322
14323        bool isFastMultiplier = false;
14324        if (Diff < 10) {
14325          switch ((unsigned char)Diff) {
14326          default: break;
14327          case 1:  // result = add base, cond
14328          case 2:  // result = lea base(    , cond*2)
14329          case 3:  // result = lea base(cond, cond*2)
14330          case 4:  // result = lea base(    , cond*4)
14331          case 5:  // result = lea base(cond, cond*4)
14332          case 8:  // result = lea base(    , cond*8)
14333          case 9:  // result = lea base(cond, cond*8)
14334            isFastMultiplier = true;
14335            break;
14336          }
14337        }
14338
14339        if (isFastMultiplier) {
14340          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
14341          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
14342                             DAG.getConstant(CC, MVT::i8), Cond);
14343          // Zero extend the condition if needed.
14344          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
14345                             Cond);
14346          // Scale the condition by the difference.
14347          if (Diff != 1)
14348            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
14349                               DAG.getConstant(Diff, Cond.getValueType()));
14350
14351          // Add the base if non-zero.
14352          if (FalseC->getAPIntValue() != 0)
14353            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
14354                               SDValue(FalseC, 0));
14355          if (N->getNumValues() == 2)  // Dead flag value?
14356            return DCI.CombineTo(N, Cond, SDValue());
14357          return Cond;
14358        }
14359      }
14360    }
14361  }
14362  return SDValue();
14363}
14364
14365
14366/// PerformMulCombine - Optimize a single multiply with constant into two
14367/// in order to implement it with two cheaper instructions, e.g.
14368/// LEA + SHL, LEA + LEA.
14369static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
14370                                 TargetLowering::DAGCombinerInfo &DCI) {
14371  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14372    return SDValue();
14373
14374  EVT VT = N->getValueType(0);
14375  if (VT != MVT::i64)
14376    return SDValue();
14377
14378  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14379  if (!C)
14380    return SDValue();
14381  uint64_t MulAmt = C->getZExtValue();
14382  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
14383    return SDValue();
14384
14385  uint64_t MulAmt1 = 0;
14386  uint64_t MulAmt2 = 0;
14387  if ((MulAmt % 9) == 0) {
14388    MulAmt1 = 9;
14389    MulAmt2 = MulAmt / 9;
14390  } else if ((MulAmt % 5) == 0) {
14391    MulAmt1 = 5;
14392    MulAmt2 = MulAmt / 5;
14393  } else if ((MulAmt % 3) == 0) {
14394    MulAmt1 = 3;
14395    MulAmt2 = MulAmt / 3;
14396  }
14397  if (MulAmt2 &&
14398      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
14399    DebugLoc DL = N->getDebugLoc();
14400
14401    if (isPowerOf2_64(MulAmt2) &&
14402        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
14403      // If second multiplifer is pow2, issue it first. We want the multiply by
14404      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
14405      // is an add.
14406      std::swap(MulAmt1, MulAmt2);
14407
14408    SDValue NewMul;
14409    if (isPowerOf2_64(MulAmt1))
14410      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
14411                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
14412    else
14413      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
14414                           DAG.getConstant(MulAmt1, VT));
14415
14416    if (isPowerOf2_64(MulAmt2))
14417      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
14418                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
14419    else
14420      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
14421                           DAG.getConstant(MulAmt2, VT));
14422
14423    // Do not add new nodes to DAG combiner worklist.
14424    DCI.CombineTo(N, NewMul, false);
14425  }
14426  return SDValue();
14427}
14428
14429static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
14430  SDValue N0 = N->getOperand(0);
14431  SDValue N1 = N->getOperand(1);
14432  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
14433  EVT VT = N0.getValueType();
14434
14435  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
14436  // since the result of setcc_c is all zero's or all ones.
14437  if (VT.isInteger() && !VT.isVector() &&
14438      N1C && N0.getOpcode() == ISD::AND &&
14439      N0.getOperand(1).getOpcode() == ISD::Constant) {
14440    SDValue N00 = N0.getOperand(0);
14441    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
14442        ((N00.getOpcode() == ISD::ANY_EXTEND ||
14443          N00.getOpcode() == ISD::ZERO_EXTEND) &&
14444         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
14445      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
14446      APInt ShAmt = N1C->getAPIntValue();
14447      Mask = Mask.shl(ShAmt);
14448      if (Mask != 0)
14449        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
14450                           N00, DAG.getConstant(Mask, VT));
14451    }
14452  }
14453
14454
14455  // Hardware support for vector shifts is sparse which makes us scalarize the
14456  // vector operations in many cases. Also, on sandybridge ADD is faster than
14457  // shl.
14458  // (shl V, 1) -> add V,V
14459  if (isSplatVector(N1.getNode())) {
14460    assert(N0.getValueType().isVector() && "Invalid vector shift type");
14461    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
14462    // We shift all of the values by one. In many cases we do not have
14463    // hardware support for this operation. This is better expressed as an ADD
14464    // of two values.
14465    if (N1C && (1 == N1C->getZExtValue())) {
14466      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0);
14467    }
14468  }
14469
14470  return SDValue();
14471}
14472
14473/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
14474///                       when possible.
14475static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
14476                                   TargetLowering::DAGCombinerInfo &DCI,
14477                                   const X86Subtarget *Subtarget) {
14478  EVT VT = N->getValueType(0);
14479  if (N->getOpcode() == ISD::SHL) {
14480    SDValue V = PerformSHLCombine(N, DAG);
14481    if (V.getNode()) return V;
14482  }
14483
14484  // On X86 with SSE2 support, we can transform this to a vector shift if
14485  // all elements are shifted by the same amount.  We can't do this in legalize
14486  // because the a constant vector is typically transformed to a constant pool
14487  // so we have no knowledge of the shift amount.
14488  if (!Subtarget->hasSSE2())
14489    return SDValue();
14490
14491  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
14492      (!Subtarget->hasAVX2() ||
14493       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
14494    return SDValue();
14495
14496  SDValue ShAmtOp = N->getOperand(1);
14497  EVT EltVT = VT.getVectorElementType();
14498  DebugLoc DL = N->getDebugLoc();
14499  SDValue BaseShAmt = SDValue();
14500  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
14501    unsigned NumElts = VT.getVectorNumElements();
14502    unsigned i = 0;
14503    for (; i != NumElts; ++i) {
14504      SDValue Arg = ShAmtOp.getOperand(i);
14505      if (Arg.getOpcode() == ISD::UNDEF) continue;
14506      BaseShAmt = Arg;
14507      break;
14508    }
14509    // Handle the case where the build_vector is all undef
14510    // FIXME: Should DAG allow this?
14511    if (i == NumElts)
14512      return SDValue();
14513
14514    for (; i != NumElts; ++i) {
14515      SDValue Arg = ShAmtOp.getOperand(i);
14516      if (Arg.getOpcode() == ISD::UNDEF) continue;
14517      if (Arg != BaseShAmt) {
14518        return SDValue();
14519      }
14520    }
14521  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
14522             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
14523    SDValue InVec = ShAmtOp.getOperand(0);
14524    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
14525      unsigned NumElts = InVec.getValueType().getVectorNumElements();
14526      unsigned i = 0;
14527      for (; i != NumElts; ++i) {
14528        SDValue Arg = InVec.getOperand(i);
14529        if (Arg.getOpcode() == ISD::UNDEF) continue;
14530        BaseShAmt = Arg;
14531        break;
14532      }
14533    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
14534       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
14535         unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
14536         if (C->getZExtValue() == SplatIdx)
14537           BaseShAmt = InVec.getOperand(1);
14538       }
14539    }
14540    if (BaseShAmt.getNode() == 0) {
14541      // Don't create instructions with illegal types after legalize
14542      // types has run.
14543      if (!DAG.getTargetLoweringInfo().isTypeLegal(EltVT) &&
14544          !DCI.isBeforeLegalize())
14545        return SDValue();
14546
14547      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
14548                              DAG.getIntPtrConstant(0));
14549    }
14550  } else
14551    return SDValue();
14552
14553  // The shift amount is an i32.
14554  if (EltVT.bitsGT(MVT::i32))
14555    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
14556  else if (EltVT.bitsLT(MVT::i32))
14557    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
14558
14559  // The shift amount is identical so we can do a vector shift.
14560  SDValue  ValOp = N->getOperand(0);
14561  switch (N->getOpcode()) {
14562  default:
14563    llvm_unreachable("Unknown shift opcode!");
14564  case ISD::SHL:
14565    switch (VT.getSimpleVT().SimpleTy) {
14566    default: return SDValue();
14567    case MVT::v2i64:
14568    case MVT::v4i32:
14569    case MVT::v8i16:
14570    case MVT::v4i64:
14571    case MVT::v8i32:
14572    case MVT::v16i16:
14573      return getTargetVShiftNode(X86ISD::VSHLI, DL, VT, ValOp, BaseShAmt, DAG);
14574    }
14575  case ISD::SRA:
14576    switch (VT.getSimpleVT().SimpleTy) {
14577    default: return SDValue();
14578    case MVT::v4i32:
14579    case MVT::v8i16:
14580    case MVT::v8i32:
14581    case MVT::v16i16:
14582      return getTargetVShiftNode(X86ISD::VSRAI, DL, VT, ValOp, BaseShAmt, DAG);
14583    }
14584  case ISD::SRL:
14585    switch (VT.getSimpleVT().SimpleTy) {
14586    default: return SDValue();
14587    case MVT::v2i64:
14588    case MVT::v4i32:
14589    case MVT::v8i16:
14590    case MVT::v4i64:
14591    case MVT::v8i32:
14592    case MVT::v16i16:
14593      return getTargetVShiftNode(X86ISD::VSRLI, DL, VT, ValOp, BaseShAmt, DAG);
14594    }
14595  }
14596}
14597
14598
14599// CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
14600// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
14601// and friends.  Likewise for OR -> CMPNEQSS.
14602static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
14603                            TargetLowering::DAGCombinerInfo &DCI,
14604                            const X86Subtarget *Subtarget) {
14605  unsigned opcode;
14606
14607  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
14608  // we're requiring SSE2 for both.
14609  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
14610    SDValue N0 = N->getOperand(0);
14611    SDValue N1 = N->getOperand(1);
14612    SDValue CMP0 = N0->getOperand(1);
14613    SDValue CMP1 = N1->getOperand(1);
14614    DebugLoc DL = N->getDebugLoc();
14615
14616    // The SETCCs should both refer to the same CMP.
14617    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
14618      return SDValue();
14619
14620    SDValue CMP00 = CMP0->getOperand(0);
14621    SDValue CMP01 = CMP0->getOperand(1);
14622    EVT     VT    = CMP00.getValueType();
14623
14624    if (VT == MVT::f32 || VT == MVT::f64) {
14625      bool ExpectingFlags = false;
14626      // Check for any users that want flags:
14627      for (SDNode::use_iterator UI = N->use_begin(),
14628             UE = N->use_end();
14629           !ExpectingFlags && UI != UE; ++UI)
14630        switch (UI->getOpcode()) {
14631        default:
14632        case ISD::BR_CC:
14633        case ISD::BRCOND:
14634        case ISD::SELECT:
14635          ExpectingFlags = true;
14636          break;
14637        case ISD::CopyToReg:
14638        case ISD::SIGN_EXTEND:
14639        case ISD::ZERO_EXTEND:
14640        case ISD::ANY_EXTEND:
14641          break;
14642        }
14643
14644      if (!ExpectingFlags) {
14645        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
14646        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
14647
14648        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
14649          X86::CondCode tmp = cc0;
14650          cc0 = cc1;
14651          cc1 = tmp;
14652        }
14653
14654        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
14655            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
14656          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
14657          X86ISD::NodeType NTOperator = is64BitFP ?
14658            X86ISD::FSETCCsd : X86ISD::FSETCCss;
14659          // FIXME: need symbolic constants for these magic numbers.
14660          // See X86ATTInstPrinter.cpp:printSSECC().
14661          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
14662          SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
14663                                              DAG.getConstant(x86cc, MVT::i8));
14664          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
14665                                              OnesOrZeroesF);
14666          SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
14667                                      DAG.getConstant(1, MVT::i32));
14668          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
14669          return OneBitOfTruth;
14670        }
14671      }
14672    }
14673  }
14674  return SDValue();
14675}
14676
14677/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
14678/// so it can be folded inside ANDNP.
14679static bool CanFoldXORWithAllOnes(const SDNode *N) {
14680  EVT VT = N->getValueType(0);
14681
14682  // Match direct AllOnes for 128 and 256-bit vectors
14683  if (ISD::isBuildVectorAllOnes(N))
14684    return true;
14685
14686  // Look through a bit convert.
14687  if (N->getOpcode() == ISD::BITCAST)
14688    N = N->getOperand(0).getNode();
14689
14690  // Sometimes the operand may come from a insert_subvector building a 256-bit
14691  // allones vector
14692  if (VT.is256BitVector() &&
14693      N->getOpcode() == ISD::INSERT_SUBVECTOR) {
14694    SDValue V1 = N->getOperand(0);
14695    SDValue V2 = N->getOperand(1);
14696
14697    if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
14698        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
14699        ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
14700        ISD::isBuildVectorAllOnes(V2.getNode()))
14701      return true;
14702  }
14703
14704  return false;
14705}
14706
14707static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
14708                                 TargetLowering::DAGCombinerInfo &DCI,
14709                                 const X86Subtarget *Subtarget) {
14710  if (DCI.isBeforeLegalizeOps())
14711    return SDValue();
14712
14713  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
14714  if (R.getNode())
14715    return R;
14716
14717  EVT VT = N->getValueType(0);
14718
14719  // Create ANDN, BLSI, and BLSR instructions
14720  // BLSI is X & (-X)
14721  // BLSR is X & (X-1)
14722  if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
14723    SDValue N0 = N->getOperand(0);
14724    SDValue N1 = N->getOperand(1);
14725    DebugLoc DL = N->getDebugLoc();
14726
14727    // Check LHS for not
14728    if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1)))
14729      return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1);
14730    // Check RHS for not
14731    if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1)))
14732      return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0);
14733
14734    // Check LHS for neg
14735    if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
14736        isZero(N0.getOperand(0)))
14737      return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
14738
14739    // Check RHS for neg
14740    if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
14741        isZero(N1.getOperand(0)))
14742      return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
14743
14744    // Check LHS for X-1
14745    if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
14746        isAllOnes(N0.getOperand(1)))
14747      return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
14748
14749    // Check RHS for X-1
14750    if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
14751        isAllOnes(N1.getOperand(1)))
14752      return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
14753
14754    return SDValue();
14755  }
14756
14757  // Want to form ANDNP nodes:
14758  // 1) In the hopes of then easily combining them with OR and AND nodes
14759  //    to form PBLEND/PSIGN.
14760  // 2) To match ANDN packed intrinsics
14761  if (VT != MVT::v2i64 && VT != MVT::v4i64)
14762    return SDValue();
14763
14764  SDValue N0 = N->getOperand(0);
14765  SDValue N1 = N->getOperand(1);
14766  DebugLoc DL = N->getDebugLoc();
14767
14768  // Check LHS for vnot
14769  if (N0.getOpcode() == ISD::XOR &&
14770      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
14771      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
14772    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
14773
14774  // Check RHS for vnot
14775  if (N1.getOpcode() == ISD::XOR &&
14776      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
14777      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
14778    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
14779
14780  return SDValue();
14781}
14782
14783static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
14784                                TargetLowering::DAGCombinerInfo &DCI,
14785                                const X86Subtarget *Subtarget) {
14786  if (DCI.isBeforeLegalizeOps())
14787    return SDValue();
14788
14789  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
14790  if (R.getNode())
14791    return R;
14792
14793  EVT VT = N->getValueType(0);
14794
14795  SDValue N0 = N->getOperand(0);
14796  SDValue N1 = N->getOperand(1);
14797
14798  // look for psign/blend
14799  if (VT == MVT::v2i64 || VT == MVT::v4i64) {
14800    if (!Subtarget->hasSSSE3() ||
14801        (VT == MVT::v4i64 && !Subtarget->hasAVX2()))
14802      return SDValue();
14803
14804    // Canonicalize pandn to RHS
14805    if (N0.getOpcode() == X86ISD::ANDNP)
14806      std::swap(N0, N1);
14807    // or (and (m, y), (pandn m, x))
14808    if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
14809      SDValue Mask = N1.getOperand(0);
14810      SDValue X    = N1.getOperand(1);
14811      SDValue Y;
14812      if (N0.getOperand(0) == Mask)
14813        Y = N0.getOperand(1);
14814      if (N0.getOperand(1) == Mask)
14815        Y = N0.getOperand(0);
14816
14817      // Check to see if the mask appeared in both the AND and ANDNP and
14818      if (!Y.getNode())
14819        return SDValue();
14820
14821      // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
14822      // Look through mask bitcast.
14823      if (Mask.getOpcode() == ISD::BITCAST)
14824        Mask = Mask.getOperand(0);
14825      if (X.getOpcode() == ISD::BITCAST)
14826        X = X.getOperand(0);
14827      if (Y.getOpcode() == ISD::BITCAST)
14828        Y = Y.getOperand(0);
14829
14830      EVT MaskVT = Mask.getValueType();
14831
14832      // Validate that the Mask operand is a vector sra node.
14833      // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
14834      // there is no psrai.b
14835      if (Mask.getOpcode() != X86ISD::VSRAI)
14836        return SDValue();
14837
14838      // Check that the SRA is all signbits.
14839      SDValue SraC = Mask.getOperand(1);
14840      unsigned SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
14841      unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
14842      if ((SraAmt + 1) != EltBits)
14843        return SDValue();
14844
14845      DebugLoc DL = N->getDebugLoc();
14846
14847      // Now we know we at least have a plendvb with the mask val.  See if
14848      // we can form a psignb/w/d.
14849      // psign = x.type == y.type == mask.type && y = sub(0, x);
14850      if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
14851          ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
14852          X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
14853        assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
14854               "Unsupported VT for PSIGN");
14855        Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
14856        return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
14857      }
14858      // PBLENDVB only available on SSE 4.1
14859      if (!Subtarget->hasSSE41())
14860        return SDValue();
14861
14862      EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
14863
14864      X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
14865      Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
14866      Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
14867      Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
14868      return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
14869    }
14870  }
14871
14872  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
14873    return SDValue();
14874
14875  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
14876  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
14877    std::swap(N0, N1);
14878  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
14879    return SDValue();
14880  if (!N0.hasOneUse() || !N1.hasOneUse())
14881    return SDValue();
14882
14883  SDValue ShAmt0 = N0.getOperand(1);
14884  if (ShAmt0.getValueType() != MVT::i8)
14885    return SDValue();
14886  SDValue ShAmt1 = N1.getOperand(1);
14887  if (ShAmt1.getValueType() != MVT::i8)
14888    return SDValue();
14889  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
14890    ShAmt0 = ShAmt0.getOperand(0);
14891  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
14892    ShAmt1 = ShAmt1.getOperand(0);
14893
14894  DebugLoc DL = N->getDebugLoc();
14895  unsigned Opc = X86ISD::SHLD;
14896  SDValue Op0 = N0.getOperand(0);
14897  SDValue Op1 = N1.getOperand(0);
14898  if (ShAmt0.getOpcode() == ISD::SUB) {
14899    Opc = X86ISD::SHRD;
14900    std::swap(Op0, Op1);
14901    std::swap(ShAmt0, ShAmt1);
14902  }
14903
14904  unsigned Bits = VT.getSizeInBits();
14905  if (ShAmt1.getOpcode() == ISD::SUB) {
14906    SDValue Sum = ShAmt1.getOperand(0);
14907    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
14908      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
14909      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
14910        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
14911      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
14912        return DAG.getNode(Opc, DL, VT,
14913                           Op0, Op1,
14914                           DAG.getNode(ISD::TRUNCATE, DL,
14915                                       MVT::i8, ShAmt0));
14916    }
14917  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
14918    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
14919    if (ShAmt0C &&
14920        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
14921      return DAG.getNode(Opc, DL, VT,
14922                         N0.getOperand(0), N1.getOperand(0),
14923                         DAG.getNode(ISD::TRUNCATE, DL,
14924                                       MVT::i8, ShAmt0));
14925  }
14926
14927  return SDValue();
14928}
14929
14930// Generate NEG and CMOV for integer abs.
14931static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
14932  EVT VT = N->getValueType(0);
14933
14934  // Since X86 does not have CMOV for 8-bit integer, we don't convert
14935  // 8-bit integer abs to NEG and CMOV.
14936  if (VT.isInteger() && VT.getSizeInBits() == 8)
14937    return SDValue();
14938
14939  SDValue N0 = N->getOperand(0);
14940  SDValue N1 = N->getOperand(1);
14941  DebugLoc DL = N->getDebugLoc();
14942
14943  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
14944  // and change it to SUB and CMOV.
14945  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
14946      N0.getOpcode() == ISD::ADD &&
14947      N0.getOperand(1) == N1 &&
14948      N1.getOpcode() == ISD::SRA &&
14949      N1.getOperand(0) == N0.getOperand(0))
14950    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
14951      if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
14952        // Generate SUB & CMOV.
14953        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
14954                                  DAG.getConstant(0, VT), N0.getOperand(0));
14955
14956        SDValue Ops[] = { N0.getOperand(0), Neg,
14957                          DAG.getConstant(X86::COND_GE, MVT::i8),
14958                          SDValue(Neg.getNode(), 1) };
14959        return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
14960                           Ops, array_lengthof(Ops));
14961      }
14962  return SDValue();
14963}
14964
14965// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
14966static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
14967                                 TargetLowering::DAGCombinerInfo &DCI,
14968                                 const X86Subtarget *Subtarget) {
14969  if (DCI.isBeforeLegalizeOps())
14970    return SDValue();
14971
14972  if (Subtarget->hasCMov()) {
14973    SDValue RV = performIntegerAbsCombine(N, DAG);
14974    if (RV.getNode())
14975      return RV;
14976  }
14977
14978  // Try forming BMI if it is available.
14979  if (!Subtarget->hasBMI())
14980    return SDValue();
14981
14982  EVT VT = N->getValueType(0);
14983
14984  if (VT != MVT::i32 && VT != MVT::i64)
14985    return SDValue();
14986
14987  assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions");
14988
14989  // Create BLSMSK instructions by finding X ^ (X-1)
14990  SDValue N0 = N->getOperand(0);
14991  SDValue N1 = N->getOperand(1);
14992  DebugLoc DL = N->getDebugLoc();
14993
14994  if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
14995      isAllOnes(N0.getOperand(1)))
14996    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1);
14997
14998  if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
14999      isAllOnes(N1.getOperand(1)))
15000    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0);
15001
15002  return SDValue();
15003}
15004
15005/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
15006static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
15007                                  TargetLowering::DAGCombinerInfo &DCI,
15008                                  const X86Subtarget *Subtarget) {
15009  LoadSDNode *Ld = cast<LoadSDNode>(N);
15010  EVT RegVT = Ld->getValueType(0);
15011  EVT MemVT = Ld->getMemoryVT();
15012  DebugLoc dl = Ld->getDebugLoc();
15013  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15014
15015  ISD::LoadExtType Ext = Ld->getExtensionType();
15016
15017  // If this is a vector EXT Load then attempt to optimize it using a
15018  // shuffle. We need SSE4 for the shuffles.
15019  // TODO: It is possible to support ZExt by zeroing the undef values
15020  // during the shuffle phase or after the shuffle.
15021  if (RegVT.isVector() && RegVT.isInteger() &&
15022      Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) {
15023    assert(MemVT != RegVT && "Cannot extend to the same type");
15024    assert(MemVT.isVector() && "Must load a vector from memory");
15025
15026    unsigned NumElems = RegVT.getVectorNumElements();
15027    unsigned RegSz = RegVT.getSizeInBits();
15028    unsigned MemSz = MemVT.getSizeInBits();
15029    assert(RegSz > MemSz && "Register size must be greater than the mem size");
15030
15031    // All sizes must be a power of two.
15032    if (!isPowerOf2_32(RegSz * MemSz * NumElems))
15033      return SDValue();
15034
15035    // Attempt to load the original value using scalar loads.
15036    // Find the largest scalar type that divides the total loaded size.
15037    MVT SclrLoadTy = MVT::i8;
15038    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
15039         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
15040      MVT Tp = (MVT::SimpleValueType)tp;
15041      if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
15042        SclrLoadTy = Tp;
15043      }
15044    }
15045
15046    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
15047    if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
15048        (64 <= MemSz))
15049      SclrLoadTy = MVT::f64;
15050
15051    // Calculate the number of scalar loads that we need to perform
15052    // in order to load our vector from memory.
15053    unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
15054
15055    // Represent our vector as a sequence of elements which are the
15056    // largest scalar that we can load.
15057    EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
15058      RegSz/SclrLoadTy.getSizeInBits());
15059
15060    // Represent the data using the same element type that is stored in
15061    // memory. In practice, we ''widen'' MemVT.
15062    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
15063                                  RegSz/MemVT.getScalarType().getSizeInBits());
15064
15065    assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
15066      "Invalid vector type");
15067
15068    // We can't shuffle using an illegal type.
15069    if (!TLI.isTypeLegal(WideVecVT))
15070      return SDValue();
15071
15072    SmallVector<SDValue, 8> Chains;
15073    SDValue Ptr = Ld->getBasePtr();
15074    SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
15075                                        TLI.getPointerTy());
15076    SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
15077
15078    for (unsigned i = 0; i < NumLoads; ++i) {
15079      // Perform a single load.
15080      SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
15081                                       Ptr, Ld->getPointerInfo(),
15082                                       Ld->isVolatile(), Ld->isNonTemporal(),
15083                                       Ld->isInvariant(), Ld->getAlignment());
15084      Chains.push_back(ScalarLoad.getValue(1));
15085      // Create the first element type using SCALAR_TO_VECTOR in order to avoid
15086      // another round of DAGCombining.
15087      if (i == 0)
15088        Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
15089      else
15090        Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
15091                          ScalarLoad, DAG.getIntPtrConstant(i));
15092
15093      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
15094    }
15095
15096    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
15097                               Chains.size());
15098
15099    // Bitcast the loaded value to a vector of the original element type, in
15100    // the size of the target vector type.
15101    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
15102    unsigned SizeRatio = RegSz/MemSz;
15103
15104    // Redistribute the loaded elements into the different locations.
15105    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
15106    for (unsigned i = 0; i != NumElems; ++i)
15107      ShuffleVec[i*SizeRatio] = i;
15108
15109    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
15110                                         DAG.getUNDEF(WideVecVT),
15111                                         &ShuffleVec[0]);
15112
15113    // Bitcast to the requested type.
15114    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
15115    // Replace the original load with the new sequence
15116    // and return the new chain.
15117    return DCI.CombineTo(N, Shuff, TF, true);
15118  }
15119
15120  return SDValue();
15121}
15122
15123/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
15124static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
15125                                   const X86Subtarget *Subtarget) {
15126  StoreSDNode *St = cast<StoreSDNode>(N);
15127  EVT VT = St->getValue().getValueType();
15128  EVT StVT = St->getMemoryVT();
15129  DebugLoc dl = St->getDebugLoc();
15130  SDValue StoredVal = St->getOperand(1);
15131  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15132
15133  // If we are saving a concatenation of two XMM registers, perform two stores.
15134  // On Sandy Bridge, 256-bit memory operations are executed by two
15135  // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
15136  // memory  operation.
15137  if (VT.is256BitVector() && !Subtarget->hasAVX2() &&
15138      StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
15139      StoredVal.getNumOperands() == 2) {
15140    SDValue Value0 = StoredVal.getOperand(0);
15141    SDValue Value1 = StoredVal.getOperand(1);
15142
15143    SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
15144    SDValue Ptr0 = St->getBasePtr();
15145    SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
15146
15147    SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
15148                                St->getPointerInfo(), St->isVolatile(),
15149                                St->isNonTemporal(), St->getAlignment());
15150    SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
15151                                St->getPointerInfo(), St->isVolatile(),
15152                                St->isNonTemporal(), St->getAlignment());
15153    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
15154  }
15155
15156  // Optimize trunc store (of multiple scalars) to shuffle and store.
15157  // First, pack all of the elements in one place. Next, store to memory
15158  // in fewer chunks.
15159  if (St->isTruncatingStore() && VT.isVector()) {
15160    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15161    unsigned NumElems = VT.getVectorNumElements();
15162    assert(StVT != VT && "Cannot truncate to the same type");
15163    unsigned FromSz = VT.getVectorElementType().getSizeInBits();
15164    unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
15165
15166    // From, To sizes and ElemCount must be pow of two
15167    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
15168    // We are going to use the original vector elt for storing.
15169    // Accumulated smaller vector elements must be a multiple of the store size.
15170    if (0 != (NumElems * FromSz) % ToSz) return SDValue();
15171
15172    unsigned SizeRatio  = FromSz / ToSz;
15173
15174    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
15175
15176    // Create a type on which we perform the shuffle
15177    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
15178            StVT.getScalarType(), NumElems*SizeRatio);
15179
15180    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
15181
15182    SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
15183    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
15184    for (unsigned i = 0; i != NumElems; ++i)
15185      ShuffleVec[i] = i * SizeRatio;
15186
15187    // Can't shuffle using an illegal type.
15188    if (!TLI.isTypeLegal(WideVecVT))
15189      return SDValue();
15190
15191    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
15192                                         DAG.getUNDEF(WideVecVT),
15193                                         &ShuffleVec[0]);
15194    // At this point all of the data is stored at the bottom of the
15195    // register. We now need to save it to mem.
15196
15197    // Find the largest store unit
15198    MVT StoreType = MVT::i8;
15199    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
15200         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
15201      MVT Tp = (MVT::SimpleValueType)tp;
15202      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
15203        StoreType = Tp;
15204    }
15205
15206    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
15207    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
15208        (64 <= NumElems * ToSz))
15209      StoreType = MVT::f64;
15210
15211    // Bitcast the original vector into a vector of store-size units
15212    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
15213            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
15214    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
15215    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
15216    SmallVector<SDValue, 8> Chains;
15217    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
15218                                        TLI.getPointerTy());
15219    SDValue Ptr = St->getBasePtr();
15220
15221    // Perform one or more big stores into memory.
15222    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
15223      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
15224                                   StoreType, ShuffWide,
15225                                   DAG.getIntPtrConstant(i));
15226      SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
15227                                St->getPointerInfo(), St->isVolatile(),
15228                                St->isNonTemporal(), St->getAlignment());
15229      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
15230      Chains.push_back(Ch);
15231    }
15232
15233    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
15234                               Chains.size());
15235  }
15236
15237
15238  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
15239  // the FP state in cases where an emms may be missing.
15240  // A preferable solution to the general problem is to figure out the right
15241  // places to insert EMMS.  This qualifies as a quick hack.
15242
15243  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
15244  if (VT.getSizeInBits() != 64)
15245    return SDValue();
15246
15247  const Function *F = DAG.getMachineFunction().getFunction();
15248  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
15249  bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
15250                     && Subtarget->hasSSE2();
15251  if ((VT.isVector() ||
15252       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
15253      isa<LoadSDNode>(St->getValue()) &&
15254      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
15255      St->getChain().hasOneUse() && !St->isVolatile()) {
15256    SDNode* LdVal = St->getValue().getNode();
15257    LoadSDNode *Ld = 0;
15258    int TokenFactorIndex = -1;
15259    SmallVector<SDValue, 8> Ops;
15260    SDNode* ChainVal = St->getChain().getNode();
15261    // Must be a store of a load.  We currently handle two cases:  the load
15262    // is a direct child, and it's under an intervening TokenFactor.  It is
15263    // possible to dig deeper under nested TokenFactors.
15264    if (ChainVal == LdVal)
15265      Ld = cast<LoadSDNode>(St->getChain());
15266    else if (St->getValue().hasOneUse() &&
15267             ChainVal->getOpcode() == ISD::TokenFactor) {
15268      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
15269        if (ChainVal->getOperand(i).getNode() == LdVal) {
15270          TokenFactorIndex = i;
15271          Ld = cast<LoadSDNode>(St->getValue());
15272        } else
15273          Ops.push_back(ChainVal->getOperand(i));
15274      }
15275    }
15276
15277    if (!Ld || !ISD::isNormalLoad(Ld))
15278      return SDValue();
15279
15280    // If this is not the MMX case, i.e. we are just turning i64 load/store
15281    // into f64 load/store, avoid the transformation if there are multiple
15282    // uses of the loaded value.
15283    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
15284      return SDValue();
15285
15286    DebugLoc LdDL = Ld->getDebugLoc();
15287    DebugLoc StDL = N->getDebugLoc();
15288    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
15289    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
15290    // pair instead.
15291    if (Subtarget->is64Bit() || F64IsLegal) {
15292      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
15293      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
15294                                  Ld->getPointerInfo(), Ld->isVolatile(),
15295                                  Ld->isNonTemporal(), Ld->isInvariant(),
15296                                  Ld->getAlignment());
15297      SDValue NewChain = NewLd.getValue(1);
15298      if (TokenFactorIndex != -1) {
15299        Ops.push_back(NewChain);
15300        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
15301                               Ops.size());
15302      }
15303      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
15304                          St->getPointerInfo(),
15305                          St->isVolatile(), St->isNonTemporal(),
15306                          St->getAlignment());
15307    }
15308
15309    // Otherwise, lower to two pairs of 32-bit loads / stores.
15310    SDValue LoAddr = Ld->getBasePtr();
15311    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
15312                                 DAG.getConstant(4, MVT::i32));
15313
15314    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
15315                               Ld->getPointerInfo(),
15316                               Ld->isVolatile(), Ld->isNonTemporal(),
15317                               Ld->isInvariant(), Ld->getAlignment());
15318    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
15319                               Ld->getPointerInfo().getWithOffset(4),
15320                               Ld->isVolatile(), Ld->isNonTemporal(),
15321                               Ld->isInvariant(),
15322                               MinAlign(Ld->getAlignment(), 4));
15323
15324    SDValue NewChain = LoLd.getValue(1);
15325    if (TokenFactorIndex != -1) {
15326      Ops.push_back(LoLd);
15327      Ops.push_back(HiLd);
15328      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
15329                             Ops.size());
15330    }
15331
15332    LoAddr = St->getBasePtr();
15333    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
15334                         DAG.getConstant(4, MVT::i32));
15335
15336    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
15337                                St->getPointerInfo(),
15338                                St->isVolatile(), St->isNonTemporal(),
15339                                St->getAlignment());
15340    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
15341                                St->getPointerInfo().getWithOffset(4),
15342                                St->isVolatile(),
15343                                St->isNonTemporal(),
15344                                MinAlign(St->getAlignment(), 4));
15345    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
15346  }
15347  return SDValue();
15348}
15349
15350/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
15351/// and return the operands for the horizontal operation in LHS and RHS.  A
15352/// horizontal operation performs the binary operation on successive elements
15353/// of its first operand, then on successive elements of its second operand,
15354/// returning the resulting values in a vector.  For example, if
15355///   A = < float a0, float a1, float a2, float a3 >
15356/// and
15357///   B = < float b0, float b1, float b2, float b3 >
15358/// then the result of doing a horizontal operation on A and B is
15359///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
15360/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
15361/// A horizontal-op B, for some already available A and B, and if so then LHS is
15362/// set to A, RHS to B, and the routine returns 'true'.
15363/// Note that the binary operation should have the property that if one of the
15364/// operands is UNDEF then the result is UNDEF.
15365static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
15366  // Look for the following pattern: if
15367  //   A = < float a0, float a1, float a2, float a3 >
15368  //   B = < float b0, float b1, float b2, float b3 >
15369  // and
15370  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
15371  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
15372  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
15373  // which is A horizontal-op B.
15374
15375  // At least one of the operands should be a vector shuffle.
15376  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
15377      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
15378    return false;
15379
15380  EVT VT = LHS.getValueType();
15381
15382  assert((VT.is128BitVector() || VT.is256BitVector()) &&
15383         "Unsupported vector type for horizontal add/sub");
15384
15385  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
15386  // operate independently on 128-bit lanes.
15387  unsigned NumElts = VT.getVectorNumElements();
15388  unsigned NumLanes = VT.getSizeInBits()/128;
15389  unsigned NumLaneElts = NumElts / NumLanes;
15390  assert((NumLaneElts % 2 == 0) &&
15391         "Vector type should have an even number of elements in each lane");
15392  unsigned HalfLaneElts = NumLaneElts/2;
15393
15394  // View LHS in the form
15395  //   LHS = VECTOR_SHUFFLE A, B, LMask
15396  // If LHS is not a shuffle then pretend it is the shuffle
15397  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
15398  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
15399  // type VT.
15400  SDValue A, B;
15401  SmallVector<int, 16> LMask(NumElts);
15402  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
15403    if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
15404      A = LHS.getOperand(0);
15405    if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
15406      B = LHS.getOperand(1);
15407    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
15408    std::copy(Mask.begin(), Mask.end(), LMask.begin());
15409  } else {
15410    if (LHS.getOpcode() != ISD::UNDEF)
15411      A = LHS;
15412    for (unsigned i = 0; i != NumElts; ++i)
15413      LMask[i] = i;
15414  }
15415
15416  // Likewise, view RHS in the form
15417  //   RHS = VECTOR_SHUFFLE C, D, RMask
15418  SDValue C, D;
15419  SmallVector<int, 16> RMask(NumElts);
15420  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
15421    if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
15422      C = RHS.getOperand(0);
15423    if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
15424      D = RHS.getOperand(1);
15425    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
15426    std::copy(Mask.begin(), Mask.end(), RMask.begin());
15427  } else {
15428    if (RHS.getOpcode() != ISD::UNDEF)
15429      C = RHS;
15430    for (unsigned i = 0; i != NumElts; ++i)
15431      RMask[i] = i;
15432  }
15433
15434  // Check that the shuffles are both shuffling the same vectors.
15435  if (!(A == C && B == D) && !(A == D && B == C))
15436    return false;
15437
15438  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
15439  if (!A.getNode() && !B.getNode())
15440    return false;
15441
15442  // If A and B occur in reverse order in RHS, then "swap" them (which means
15443  // rewriting the mask).
15444  if (A != C)
15445    CommuteVectorShuffleMask(RMask, NumElts);
15446
15447  // At this point LHS and RHS are equivalent to
15448  //   LHS = VECTOR_SHUFFLE A, B, LMask
15449  //   RHS = VECTOR_SHUFFLE A, B, RMask
15450  // Check that the masks correspond to performing a horizontal operation.
15451  for (unsigned i = 0; i != NumElts; ++i) {
15452    int LIdx = LMask[i], RIdx = RMask[i];
15453
15454    // Ignore any UNDEF components.
15455    if (LIdx < 0 || RIdx < 0 ||
15456        (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
15457        (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
15458      continue;
15459
15460    // Check that successive elements are being operated on.  If not, this is
15461    // not a horizontal operation.
15462    unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs
15463    unsigned LaneStart = (i/NumLaneElts) * NumLaneElts;
15464    int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart;
15465    if (!(LIdx == Index && RIdx == Index + 1) &&
15466        !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
15467      return false;
15468  }
15469
15470  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
15471  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
15472  return true;
15473}
15474
15475/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
15476static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
15477                                  const X86Subtarget *Subtarget) {
15478  EVT VT = N->getValueType(0);
15479  SDValue LHS = N->getOperand(0);
15480  SDValue RHS = N->getOperand(1);
15481
15482  // Try to synthesize horizontal adds from adds of shuffles.
15483  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
15484       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
15485      isHorizontalBinOp(LHS, RHS, true))
15486    return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
15487  return SDValue();
15488}
15489
15490/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
15491static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
15492                                  const X86Subtarget *Subtarget) {
15493  EVT VT = N->getValueType(0);
15494  SDValue LHS = N->getOperand(0);
15495  SDValue RHS = N->getOperand(1);
15496
15497  // Try to synthesize horizontal subs from subs of shuffles.
15498  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
15499       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
15500      isHorizontalBinOp(LHS, RHS, false))
15501    return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
15502  return SDValue();
15503}
15504
15505/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
15506/// X86ISD::FXOR nodes.
15507static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
15508  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
15509  // F[X]OR(0.0, x) -> x
15510  // F[X]OR(x, 0.0) -> x
15511  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
15512    if (C->getValueAPF().isPosZero())
15513      return N->getOperand(1);
15514  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
15515    if (C->getValueAPF().isPosZero())
15516      return N->getOperand(0);
15517  return SDValue();
15518}
15519
15520/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and
15521/// X86ISD::FMAX nodes.
15522static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
15523  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
15524
15525  // Only perform optimizations if UnsafeMath is used.
15526  if (!DAG.getTarget().Options.UnsafeFPMath)
15527    return SDValue();
15528
15529  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
15530  // into FMINC and FMAXC, which are Commutative operations.
15531  unsigned NewOp = 0;
15532  switch (N->getOpcode()) {
15533    default: llvm_unreachable("unknown opcode");
15534    case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
15535    case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
15536  }
15537
15538  return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0),
15539                     N->getOperand(0), N->getOperand(1));
15540}
15541
15542
15543/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
15544static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
15545  // FAND(0.0, x) -> 0.0
15546  // FAND(x, 0.0) -> 0.0
15547  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
15548    if (C->getValueAPF().isPosZero())
15549      return N->getOperand(0);
15550  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
15551    if (C->getValueAPF().isPosZero())
15552      return N->getOperand(1);
15553  return SDValue();
15554}
15555
15556static SDValue PerformBTCombine(SDNode *N,
15557                                SelectionDAG &DAG,
15558                                TargetLowering::DAGCombinerInfo &DCI) {
15559  // BT ignores high bits in the bit index operand.
15560  SDValue Op1 = N->getOperand(1);
15561  if (Op1.hasOneUse()) {
15562    unsigned BitWidth = Op1.getValueSizeInBits();
15563    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
15564    APInt KnownZero, KnownOne;
15565    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
15566                                          !DCI.isBeforeLegalizeOps());
15567    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15568    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
15569        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
15570      DCI.CommitTargetLoweringOpt(TLO);
15571  }
15572  return SDValue();
15573}
15574
15575static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
15576  SDValue Op = N->getOperand(0);
15577  if (Op.getOpcode() == ISD::BITCAST)
15578    Op = Op.getOperand(0);
15579  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
15580  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
15581      VT.getVectorElementType().getSizeInBits() ==
15582      OpVT.getVectorElementType().getSizeInBits()) {
15583    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
15584  }
15585  return SDValue();
15586}
15587
15588static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
15589                                  TargetLowering::DAGCombinerInfo &DCI,
15590                                  const X86Subtarget *Subtarget) {
15591  if (!DCI.isBeforeLegalizeOps())
15592    return SDValue();
15593
15594  if (!Subtarget->hasAVX())
15595    return SDValue();
15596
15597  EVT VT = N->getValueType(0);
15598  SDValue Op = N->getOperand(0);
15599  EVT OpVT = Op.getValueType();
15600  DebugLoc dl = N->getDebugLoc();
15601
15602  if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
15603      (VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
15604
15605    if (Subtarget->hasAVX2())
15606      return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op);
15607
15608    // Optimize vectors in AVX mode
15609    // Sign extend  v8i16 to v8i32 and
15610    //              v4i32 to v4i64
15611    //
15612    // Divide input vector into two parts
15613    // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
15614    // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
15615    // concat the vectors to original VT
15616
15617    unsigned NumElems = OpVT.getVectorNumElements();
15618    SDValue Undef = DAG.getUNDEF(OpVT);
15619
15620    SmallVector<int,8> ShufMask1(NumElems, -1);
15621    for (unsigned i = 0; i != NumElems/2; ++i)
15622      ShufMask1[i] = i;
15623
15624    SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask1[0]);
15625
15626    SmallVector<int,8> ShufMask2(NumElems, -1);
15627    for (unsigned i = 0; i != NumElems/2; ++i)
15628      ShufMask2[i] = i + NumElems/2;
15629
15630    SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask2[0]);
15631
15632    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
15633                                  VT.getVectorNumElements()/2);
15634
15635    OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
15636    OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
15637
15638    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
15639  }
15640  return SDValue();
15641}
15642
15643static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
15644                                 const X86Subtarget* Subtarget) {
15645  DebugLoc dl = N->getDebugLoc();
15646  EVT VT = N->getValueType(0);
15647
15648  // Let legalize expand this if it isn't a legal type yet.
15649  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
15650    return SDValue();
15651
15652  EVT ScalarVT = VT.getScalarType();
15653  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
15654      (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
15655    return SDValue();
15656
15657  SDValue A = N->getOperand(0);
15658  SDValue B = N->getOperand(1);
15659  SDValue C = N->getOperand(2);
15660
15661  bool NegA = (A.getOpcode() == ISD::FNEG);
15662  bool NegB = (B.getOpcode() == ISD::FNEG);
15663  bool NegC = (C.getOpcode() == ISD::FNEG);
15664
15665  // Negative multiplication when NegA xor NegB
15666  bool NegMul = (NegA != NegB);
15667  if (NegA)
15668    A = A.getOperand(0);
15669  if (NegB)
15670    B = B.getOperand(0);
15671  if (NegC)
15672    C = C.getOperand(0);
15673
15674  unsigned Opcode;
15675  if (!NegMul)
15676    Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
15677  else
15678    Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
15679
15680  return DAG.getNode(Opcode, dl, VT, A, B, C);
15681}
15682
15683static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
15684                                  TargetLowering::DAGCombinerInfo &DCI,
15685                                  const X86Subtarget *Subtarget) {
15686  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
15687  //           (and (i32 x86isd::setcc_carry), 1)
15688  // This eliminates the zext. This transformation is necessary because
15689  // ISD::SETCC is always legalized to i8.
15690  DebugLoc dl = N->getDebugLoc();
15691  SDValue N0 = N->getOperand(0);
15692  EVT VT = N->getValueType(0);
15693  EVT OpVT = N0.getValueType();
15694
15695  if (N0.getOpcode() == ISD::AND &&
15696      N0.hasOneUse() &&
15697      N0.getOperand(0).hasOneUse()) {
15698    SDValue N00 = N0.getOperand(0);
15699    if (N00.getOpcode() != X86ISD::SETCC_CARRY)
15700      return SDValue();
15701    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
15702    if (!C || C->getZExtValue() != 1)
15703      return SDValue();
15704    return DAG.getNode(ISD::AND, dl, VT,
15705                       DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
15706                                   N00.getOperand(0), N00.getOperand(1)),
15707                       DAG.getConstant(1, VT));
15708  }
15709
15710  // Optimize vectors in AVX mode:
15711  //
15712  //   v8i16 -> v8i32
15713  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
15714  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
15715  //   Concat upper and lower parts.
15716  //
15717  //   v4i32 -> v4i64
15718  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
15719  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
15720  //   Concat upper and lower parts.
15721  //
15722  if (!DCI.isBeforeLegalizeOps())
15723    return SDValue();
15724
15725  if (!Subtarget->hasAVX())
15726    return SDValue();
15727
15728  if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
15729      ((VT == MVT::v4i64) && (OpVT == MVT::v4i32)))  {
15730
15731    if (Subtarget->hasAVX2())
15732      return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0);
15733
15734    SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
15735    SDValue OpLo = getUnpackl(DAG, dl, OpVT, N0, ZeroVec);
15736    SDValue OpHi = getUnpackh(DAG, dl, OpVT, N0, ZeroVec);
15737
15738    EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
15739                               VT.getVectorNumElements()/2);
15740
15741    OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
15742    OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
15743
15744    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
15745  }
15746
15747  return SDValue();
15748}
15749
15750// Optimize x == -y --> x+y == 0
15751//          x != -y --> x+y != 0
15752static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
15753  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
15754  SDValue LHS = N->getOperand(0);
15755  SDValue RHS = N->getOperand(1);
15756
15757  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
15758    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
15759      if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
15760        SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
15761                                   LHS.getValueType(), RHS, LHS.getOperand(1));
15762        return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
15763                            addV, DAG.getConstant(0, addV.getValueType()), CC);
15764      }
15765  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
15766    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
15767      if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
15768        SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
15769                                   RHS.getValueType(), LHS, RHS.getOperand(1));
15770        return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
15771                            addV, DAG.getConstant(0, addV.getValueType()), CC);
15772      }
15773  return SDValue();
15774}
15775
15776// Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
15777static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
15778                                   TargetLowering::DAGCombinerInfo &DCI,
15779                                   const X86Subtarget *Subtarget) {
15780  DebugLoc DL = N->getDebugLoc();
15781  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
15782  SDValue EFLAGS = N->getOperand(1);
15783
15784  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
15785  // a zext and produces an all-ones bit which is more useful than 0/1 in some
15786  // cases.
15787  if (CC == X86::COND_B)
15788    return DAG.getNode(ISD::AND, DL, MVT::i8,
15789                       DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
15790                                   DAG.getConstant(CC, MVT::i8), EFLAGS),
15791                       DAG.getConstant(1, MVT::i8));
15792
15793  SDValue Flags;
15794
15795  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
15796  if (Flags.getNode()) {
15797    SDValue Cond = DAG.getConstant(CC, MVT::i8);
15798    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
15799  }
15800
15801  Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
15802  if (Flags.getNode()) {
15803    SDValue Cond = DAG.getConstant(CC, MVT::i8);
15804    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
15805  }
15806
15807  return SDValue();
15808}
15809
15810// Optimize branch condition evaluation.
15811//
15812static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
15813                                    TargetLowering::DAGCombinerInfo &DCI,
15814                                    const X86Subtarget *Subtarget) {
15815  DebugLoc DL = N->getDebugLoc();
15816  SDValue Chain = N->getOperand(0);
15817  SDValue Dest = N->getOperand(1);
15818  SDValue EFLAGS = N->getOperand(3);
15819  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
15820
15821  SDValue Flags;
15822
15823  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
15824  if (Flags.getNode()) {
15825    SDValue Cond = DAG.getConstant(CC, MVT::i8);
15826    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
15827                       Flags);
15828  }
15829
15830  Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
15831  if (Flags.getNode()) {
15832    SDValue Cond = DAG.getConstant(CC, MVT::i8);
15833    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
15834                       Flags);
15835  }
15836
15837  return SDValue();
15838}
15839
15840static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG) {
15841  SDValue Op0 = N->getOperand(0);
15842  EVT InVT = Op0->getValueType(0);
15843
15844  // UINT_TO_FP(v4i8) -> SINT_TO_FP(ZEXT(v4i8 to v4i32))
15845  if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
15846    DebugLoc dl = N->getDebugLoc();
15847    MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
15848    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
15849    // Notice that we use SINT_TO_FP because we know that the high bits
15850    // are zero and SINT_TO_FP is better supported by the hardware.
15851    return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
15852  }
15853
15854  return SDValue();
15855}
15856
15857static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
15858                                        const X86TargetLowering *XTLI) {
15859  SDValue Op0 = N->getOperand(0);
15860  EVT InVT = Op0->getValueType(0);
15861
15862  // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
15863  if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
15864    DebugLoc dl = N->getDebugLoc();
15865    MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
15866    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
15867    return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
15868  }
15869
15870  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
15871  // a 32-bit target where SSE doesn't support i64->FP operations.
15872  if (Op0.getOpcode() == ISD::LOAD) {
15873    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
15874    EVT VT = Ld->getValueType(0);
15875    if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
15876        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
15877        !XTLI->getSubtarget()->is64Bit() &&
15878        !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
15879      SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
15880                                          Ld->getChain(), Op0, DAG);
15881      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
15882      return FILDChain;
15883    }
15884  }
15885  return SDValue();
15886}
15887
15888static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG) {
15889  EVT VT = N->getValueType(0);
15890
15891  // v4i8 = FP_TO_SINT() -> v4i8 = TRUNCATE (V4i32 = FP_TO_SINT()
15892  if (VT == MVT::v8i8 || VT == MVT::v4i8) {
15893    DebugLoc dl = N->getDebugLoc();
15894    MVT DstVT = VT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
15895    SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0));
15896    return DAG.getNode(ISD::TRUNCATE, dl, VT, I);
15897  }
15898
15899  return SDValue();
15900}
15901
15902// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
15903static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
15904                                 X86TargetLowering::DAGCombinerInfo &DCI) {
15905  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
15906  // the result is either zero or one (depending on the input carry bit).
15907  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
15908  if (X86::isZeroNode(N->getOperand(0)) &&
15909      X86::isZeroNode(N->getOperand(1)) &&
15910      // We don't have a good way to replace an EFLAGS use, so only do this when
15911      // dead right now.
15912      SDValue(N, 1).use_empty()) {
15913    DebugLoc DL = N->getDebugLoc();
15914    EVT VT = N->getValueType(0);
15915    SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
15916    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
15917                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
15918                                           DAG.getConstant(X86::COND_B,MVT::i8),
15919                                           N->getOperand(2)),
15920                               DAG.getConstant(1, VT));
15921    return DCI.CombineTo(N, Res1, CarryOut);
15922  }
15923
15924  return SDValue();
15925}
15926
15927// fold (add Y, (sete  X, 0)) -> adc  0, Y
15928//      (add Y, (setne X, 0)) -> sbb -1, Y
15929//      (sub (sete  X, 0), Y) -> sbb  0, Y
15930//      (sub (setne X, 0), Y) -> adc -1, Y
15931static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
15932  DebugLoc DL = N->getDebugLoc();
15933
15934  // Look through ZExts.
15935  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
15936  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
15937    return SDValue();
15938
15939  SDValue SetCC = Ext.getOperand(0);
15940  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
15941    return SDValue();
15942
15943  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
15944  if (CC != X86::COND_E && CC != X86::COND_NE)
15945    return SDValue();
15946
15947  SDValue Cmp = SetCC.getOperand(1);
15948  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
15949      !X86::isZeroNode(Cmp.getOperand(1)) ||
15950      !Cmp.getOperand(0).getValueType().isInteger())
15951    return SDValue();
15952
15953  SDValue CmpOp0 = Cmp.getOperand(0);
15954  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
15955                               DAG.getConstant(1, CmpOp0.getValueType()));
15956
15957  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
15958  if (CC == X86::COND_NE)
15959    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
15960                       DL, OtherVal.getValueType(), OtherVal,
15961                       DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
15962  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
15963                     DL, OtherVal.getValueType(), OtherVal,
15964                     DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
15965}
15966
15967/// PerformADDCombine - Do target-specific dag combines on integer adds.
15968static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
15969                                 const X86Subtarget *Subtarget) {
15970  EVT VT = N->getValueType(0);
15971  SDValue Op0 = N->getOperand(0);
15972  SDValue Op1 = N->getOperand(1);
15973
15974  // Try to synthesize horizontal adds from adds of shuffles.
15975  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
15976       (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
15977      isHorizontalBinOp(Op0, Op1, true))
15978    return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
15979
15980  return OptimizeConditionalInDecrement(N, DAG);
15981}
15982
15983static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
15984                                 const X86Subtarget *Subtarget) {
15985  SDValue Op0 = N->getOperand(0);
15986  SDValue Op1 = N->getOperand(1);
15987
15988  // X86 can't encode an immediate LHS of a sub. See if we can push the
15989  // negation into a preceding instruction.
15990  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
15991    // If the RHS of the sub is a XOR with one use and a constant, invert the
15992    // immediate. Then add one to the LHS of the sub so we can turn
15993    // X-Y -> X+~Y+1, saving one register.
15994    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
15995        isa<ConstantSDNode>(Op1.getOperand(1))) {
15996      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
15997      EVT VT = Op0.getValueType();
15998      SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
15999                                   Op1.getOperand(0),
16000                                   DAG.getConstant(~XorC, VT));
16001      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
16002                         DAG.getConstant(C->getAPIntValue()+1, VT));
16003    }
16004  }
16005
16006  // Try to synthesize horizontal adds from adds of shuffles.
16007  EVT VT = N->getValueType(0);
16008  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
16009       (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
16010      isHorizontalBinOp(Op0, Op1, true))
16011    return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
16012
16013  return OptimizeConditionalInDecrement(N, DAG);
16014}
16015
16016SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
16017                                             DAGCombinerInfo &DCI) const {
16018  SelectionDAG &DAG = DCI.DAG;
16019  switch (N->getOpcode()) {
16020  default: break;
16021  case ISD::EXTRACT_VECTOR_ELT:
16022    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
16023  case ISD::VSELECT:
16024  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
16025  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
16026  case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
16027  case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
16028  case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
16029  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
16030  case ISD::SHL:
16031  case ISD::SRA:
16032  case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
16033  case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
16034  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
16035  case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
16036  case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
16037  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
16038  case ISD::UINT_TO_FP:     return PerformUINT_TO_FPCombine(N, DAG);
16039  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
16040  case ISD::FP_TO_SINT:     return PerformFP_TO_SINTCombine(N, DAG);
16041  case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
16042  case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
16043  case X86ISD::FXOR:
16044  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
16045  case X86ISD::FMIN:
16046  case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
16047  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
16048  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
16049  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
16050  case ISD::ANY_EXTEND:
16051  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
16052  case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
16053  case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG, DCI);
16054  case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG);
16055  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
16056  case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
16057  case X86ISD::SHUFP:       // Handle all target specific shuffles
16058  case X86ISD::PALIGN:
16059  case X86ISD::UNPCKH:
16060  case X86ISD::UNPCKL:
16061  case X86ISD::MOVHLPS:
16062  case X86ISD::MOVLHPS:
16063  case X86ISD::PSHUFD:
16064  case X86ISD::PSHUFHW:
16065  case X86ISD::PSHUFLW:
16066  case X86ISD::MOVSS:
16067  case X86ISD::MOVSD:
16068  case X86ISD::VPERMILP:
16069  case X86ISD::VPERM2X128:
16070  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
16071  case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
16072  }
16073
16074  return SDValue();
16075}
16076
16077/// isTypeDesirableForOp - Return true if the target has native support for
16078/// the specified value type and it is 'desirable' to use the type for the
16079/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
16080/// instruction encodings are longer and some i16 instructions are slow.
16081bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
16082  if (!isTypeLegal(VT))
16083    return false;
16084  if (VT != MVT::i16)
16085    return true;
16086
16087  switch (Opc) {
16088  default:
16089    return true;
16090  case ISD::LOAD:
16091  case ISD::SIGN_EXTEND:
16092  case ISD::ZERO_EXTEND:
16093  case ISD::ANY_EXTEND:
16094  case ISD::SHL:
16095  case ISD::SRL:
16096  case ISD::SUB:
16097  case ISD::ADD:
16098  case ISD::MUL:
16099  case ISD::AND:
16100  case ISD::OR:
16101  case ISD::XOR:
16102    return false;
16103  }
16104}
16105
16106/// IsDesirableToPromoteOp - This method query the target whether it is
16107/// beneficial for dag combiner to promote the specified node. If true, it
16108/// should return the desired promotion type by reference.
16109bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
16110  EVT VT = Op.getValueType();
16111  if (VT != MVT::i16)
16112    return false;
16113
16114  bool Promote = false;
16115  bool Commute = false;
16116  switch (Op.getOpcode()) {
16117  default: break;
16118  case ISD::LOAD: {
16119    LoadSDNode *LD = cast<LoadSDNode>(Op);
16120    // If the non-extending load has a single use and it's not live out, then it
16121    // might be folded.
16122    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
16123                                                     Op.hasOneUse()*/) {
16124      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16125             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
16126        // The only case where we'd want to promote LOAD (rather then it being
16127        // promoted as an operand is when it's only use is liveout.
16128        if (UI->getOpcode() != ISD::CopyToReg)
16129          return false;
16130      }
16131    }
16132    Promote = true;
16133    break;
16134  }
16135  case ISD::SIGN_EXTEND:
16136  case ISD::ZERO_EXTEND:
16137  case ISD::ANY_EXTEND:
16138    Promote = true;
16139    break;
16140  case ISD::SHL:
16141  case ISD::SRL: {
16142    SDValue N0 = Op.getOperand(0);
16143    // Look out for (store (shl (load), x)).
16144    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
16145      return false;
16146    Promote = true;
16147    break;
16148  }
16149  case ISD::ADD:
16150  case ISD::MUL:
16151  case ISD::AND:
16152  case ISD::OR:
16153  case ISD::XOR:
16154    Commute = true;
16155    // fallthrough
16156  case ISD::SUB: {
16157    SDValue N0 = Op.getOperand(0);
16158    SDValue N1 = Op.getOperand(1);
16159    if (!Commute && MayFoldLoad(N1))
16160      return false;
16161    // Avoid disabling potential load folding opportunities.
16162    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
16163      return false;
16164    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
16165      return false;
16166    Promote = true;
16167  }
16168  }
16169
16170  PVT = MVT::i32;
16171  return Promote;
16172}
16173
16174//===----------------------------------------------------------------------===//
16175//                           X86 Inline Assembly Support
16176//===----------------------------------------------------------------------===//
16177
16178namespace {
16179  // Helper to match a string separated by whitespace.
16180  bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
16181    s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
16182
16183    for (unsigned i = 0, e = args.size(); i != e; ++i) {
16184      StringRef piece(*args[i]);
16185      if (!s.startswith(piece)) // Check if the piece matches.
16186        return false;
16187
16188      s = s.substr(piece.size());
16189      StringRef::size_type pos = s.find_first_not_of(" \t");
16190      if (pos == 0) // We matched a prefix.
16191        return false;
16192
16193      s = s.substr(pos);
16194    }
16195
16196    return s.empty();
16197  }
16198  const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
16199}
16200
16201bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
16202  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
16203
16204  std::string AsmStr = IA->getAsmString();
16205
16206  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
16207  if (!Ty || Ty->getBitWidth() % 16 != 0)
16208    return false;
16209
16210  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
16211  SmallVector<StringRef, 4> AsmPieces;
16212  SplitString(AsmStr, AsmPieces, ";\n");
16213
16214  switch (AsmPieces.size()) {
16215  default: return false;
16216  case 1:
16217    // FIXME: this should verify that we are targeting a 486 or better.  If not,
16218    // we will turn this bswap into something that will be lowered to logical
16219    // ops instead of emitting the bswap asm.  For now, we don't support 486 or
16220    // lower so don't worry about this.
16221    // bswap $0
16222    if (matchAsm(AsmPieces[0], "bswap", "$0") ||
16223        matchAsm(AsmPieces[0], "bswapl", "$0") ||
16224        matchAsm(AsmPieces[0], "bswapq", "$0") ||
16225        matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
16226        matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
16227        matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
16228      // No need to check constraints, nothing other than the equivalent of
16229      // "=r,0" would be valid here.
16230      return IntrinsicLowering::LowerToByteSwap(CI);
16231    }
16232
16233    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
16234    if (CI->getType()->isIntegerTy(16) &&
16235        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
16236        (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
16237         matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
16238      AsmPieces.clear();
16239      const std::string &ConstraintsStr = IA->getConstraintString();
16240      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
16241      std::sort(AsmPieces.begin(), AsmPieces.end());
16242      if (AsmPieces.size() == 4 &&
16243          AsmPieces[0] == "~{cc}" &&
16244          AsmPieces[1] == "~{dirflag}" &&
16245          AsmPieces[2] == "~{flags}" &&
16246          AsmPieces[3] == "~{fpsr}")
16247      return IntrinsicLowering::LowerToByteSwap(CI);
16248    }
16249    break;
16250  case 3:
16251    if (CI->getType()->isIntegerTy(32) &&
16252        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
16253        matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
16254        matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
16255        matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
16256      AsmPieces.clear();
16257      const std::string &ConstraintsStr = IA->getConstraintString();
16258      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
16259      std::sort(AsmPieces.begin(), AsmPieces.end());
16260      if (AsmPieces.size() == 4 &&
16261          AsmPieces[0] == "~{cc}" &&
16262          AsmPieces[1] == "~{dirflag}" &&
16263          AsmPieces[2] == "~{flags}" &&
16264          AsmPieces[3] == "~{fpsr}")
16265        return IntrinsicLowering::LowerToByteSwap(CI);
16266    }
16267
16268    if (CI->getType()->isIntegerTy(64)) {
16269      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
16270      if (Constraints.size() >= 2 &&
16271          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
16272          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
16273        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
16274        if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
16275            matchAsm(AsmPieces[1], "bswap", "%edx") &&
16276            matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
16277          return IntrinsicLowering::LowerToByteSwap(CI);
16278      }
16279    }
16280    break;
16281  }
16282  return false;
16283}
16284
16285
16286
16287/// getConstraintType - Given a constraint letter, return the type of
16288/// constraint it is for this target.
16289X86TargetLowering::ConstraintType
16290X86TargetLowering::getConstraintType(const std::string &Constraint) const {
16291  if (Constraint.size() == 1) {
16292    switch (Constraint[0]) {
16293    case 'R':
16294    case 'q':
16295    case 'Q':
16296    case 'f':
16297    case 't':
16298    case 'u':
16299    case 'y':
16300    case 'x':
16301    case 'Y':
16302    case 'l':
16303      return C_RegisterClass;
16304    case 'a':
16305    case 'b':
16306    case 'c':
16307    case 'd':
16308    case 'S':
16309    case 'D':
16310    case 'A':
16311      return C_Register;
16312    case 'I':
16313    case 'J':
16314    case 'K':
16315    case 'L':
16316    case 'M':
16317    case 'N':
16318    case 'G':
16319    case 'C':
16320    case 'e':
16321    case 'Z':
16322      return C_Other;
16323    default:
16324      break;
16325    }
16326  }
16327  return TargetLowering::getConstraintType(Constraint);
16328}
16329
16330/// Examine constraint type and operand type and determine a weight value.
16331/// This object must already have been set up with the operand type
16332/// and the current alternative constraint selected.
16333TargetLowering::ConstraintWeight
16334  X86TargetLowering::getSingleConstraintMatchWeight(
16335    AsmOperandInfo &info, const char *constraint) const {
16336  ConstraintWeight weight = CW_Invalid;
16337  Value *CallOperandVal = info.CallOperandVal;
16338    // If we don't have a value, we can't do a match,
16339    // but allow it at the lowest weight.
16340  if (CallOperandVal == NULL)
16341    return CW_Default;
16342  Type *type = CallOperandVal->getType();
16343  // Look at the constraint type.
16344  switch (*constraint) {
16345  default:
16346    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
16347  case 'R':
16348  case 'q':
16349  case 'Q':
16350  case 'a':
16351  case 'b':
16352  case 'c':
16353  case 'd':
16354  case 'S':
16355  case 'D':
16356  case 'A':
16357    if (CallOperandVal->getType()->isIntegerTy())
16358      weight = CW_SpecificReg;
16359    break;
16360  case 'f':
16361  case 't':
16362  case 'u':
16363      if (type->isFloatingPointTy())
16364        weight = CW_SpecificReg;
16365      break;
16366  case 'y':
16367      if (type->isX86_MMXTy() && Subtarget->hasMMX())
16368        weight = CW_SpecificReg;
16369      break;
16370  case 'x':
16371  case 'Y':
16372    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
16373        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasAVX()))
16374      weight = CW_Register;
16375    break;
16376  case 'I':
16377    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
16378      if (C->getZExtValue() <= 31)
16379        weight = CW_Constant;
16380    }
16381    break;
16382  case 'J':
16383    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
16384      if (C->getZExtValue() <= 63)
16385        weight = CW_Constant;
16386    }
16387    break;
16388  case 'K':
16389    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
16390      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
16391        weight = CW_Constant;
16392    }
16393    break;
16394  case 'L':
16395    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
16396      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
16397        weight = CW_Constant;
16398    }
16399    break;
16400  case 'M':
16401    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
16402      if (C->getZExtValue() <= 3)
16403        weight = CW_Constant;
16404    }
16405    break;
16406  case 'N':
16407    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
16408      if (C->getZExtValue() <= 0xff)
16409        weight = CW_Constant;
16410    }
16411    break;
16412  case 'G':
16413  case 'C':
16414    if (dyn_cast<ConstantFP>(CallOperandVal)) {
16415      weight = CW_Constant;
16416    }
16417    break;
16418  case 'e':
16419    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
16420      if ((C->getSExtValue() >= -0x80000000LL) &&
16421          (C->getSExtValue() <= 0x7fffffffLL))
16422        weight = CW_Constant;
16423    }
16424    break;
16425  case 'Z':
16426    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
16427      if (C->getZExtValue() <= 0xffffffff)
16428        weight = CW_Constant;
16429    }
16430    break;
16431  }
16432  return weight;
16433}
16434
16435/// LowerXConstraint - try to replace an X constraint, which matches anything,
16436/// with another that has more specific requirements based on the type of the
16437/// corresponding operand.
16438const char *X86TargetLowering::
16439LowerXConstraint(EVT ConstraintVT) const {
16440  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
16441  // 'f' like normal targets.
16442  if (ConstraintVT.isFloatingPoint()) {
16443    if (Subtarget->hasSSE2())
16444      return "Y";
16445    if (Subtarget->hasSSE1())
16446      return "x";
16447  }
16448
16449  return TargetLowering::LowerXConstraint(ConstraintVT);
16450}
16451
16452/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
16453/// vector.  If it is invalid, don't add anything to Ops.
16454void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
16455                                                     std::string &Constraint,
16456                                                     std::vector<SDValue>&Ops,
16457                                                     SelectionDAG &DAG) const {
16458  SDValue Result(0, 0);
16459
16460  // Only support length 1 constraints for now.
16461  if (Constraint.length() > 1) return;
16462
16463  char ConstraintLetter = Constraint[0];
16464  switch (ConstraintLetter) {
16465  default: break;
16466  case 'I':
16467    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
16468      if (C->getZExtValue() <= 31) {
16469        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
16470        break;
16471      }
16472    }
16473    return;
16474  case 'J':
16475    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
16476      if (C->getZExtValue() <= 63) {
16477        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
16478        break;
16479      }
16480    }
16481    return;
16482  case 'K':
16483    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
16484      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
16485        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
16486        break;
16487      }
16488    }
16489    return;
16490  case 'N':
16491    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
16492      if (C->getZExtValue() <= 255) {
16493        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
16494        break;
16495      }
16496    }
16497    return;
16498  case 'e': {
16499    // 32-bit signed value
16500    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
16501      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
16502                                           C->getSExtValue())) {
16503        // Widen to 64 bits here to get it sign extended.
16504        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
16505        break;
16506      }
16507    // FIXME gcc accepts some relocatable values here too, but only in certain
16508    // memory models; it's complicated.
16509    }
16510    return;
16511  }
16512  case 'Z': {
16513    // 32-bit unsigned value
16514    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
16515      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
16516                                           C->getZExtValue())) {
16517        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
16518        break;
16519      }
16520    }
16521    // FIXME gcc accepts some relocatable values here too, but only in certain
16522    // memory models; it's complicated.
16523    return;
16524  }
16525  case 'i': {
16526    // Literal immediates are always ok.
16527    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
16528      // Widen to 64 bits here to get it sign extended.
16529      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
16530      break;
16531    }
16532
16533    // In any sort of PIC mode addresses need to be computed at runtime by
16534    // adding in a register or some sort of table lookup.  These can't
16535    // be used as immediates.
16536    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
16537      return;
16538
16539    // If we are in non-pic codegen mode, we allow the address of a global (with
16540    // an optional displacement) to be used with 'i'.
16541    GlobalAddressSDNode *GA = 0;
16542    int64_t Offset = 0;
16543
16544    // Match either (GA), (GA+C), (GA+C1+C2), etc.
16545    while (1) {
16546      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
16547        Offset += GA->getOffset();
16548        break;
16549      } else if (Op.getOpcode() == ISD::ADD) {
16550        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
16551          Offset += C->getZExtValue();
16552          Op = Op.getOperand(0);
16553          continue;
16554        }
16555      } else if (Op.getOpcode() == ISD::SUB) {
16556        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
16557          Offset += -C->getZExtValue();
16558          Op = Op.getOperand(0);
16559          continue;
16560        }
16561      }
16562
16563      // Otherwise, this isn't something we can handle, reject it.
16564      return;
16565    }
16566
16567    const GlobalValue *GV = GA->getGlobal();
16568    // If we require an extra load to get this address, as in PIC mode, we
16569    // can't accept it.
16570    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
16571                                                        getTargetMachine())))
16572      return;
16573
16574    Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
16575                                        GA->getValueType(0), Offset);
16576    break;
16577  }
16578  }
16579
16580  if (Result.getNode()) {
16581    Ops.push_back(Result);
16582    return;
16583  }
16584  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
16585}
16586
16587std::pair<unsigned, const TargetRegisterClass*>
16588X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
16589                                                EVT VT) const {
16590  // First, see if this is a constraint that directly corresponds to an LLVM
16591  // register class.
16592  if (Constraint.size() == 1) {
16593    // GCC Constraint Letters
16594    switch (Constraint[0]) {
16595    default: break;
16596      // TODO: Slight differences here in allocation order and leaving
16597      // RIP in the class. Do they matter any more here than they do
16598      // in the normal allocation?
16599    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
16600      if (Subtarget->is64Bit()) {
16601        if (VT == MVT::i32 || VT == MVT::f32)
16602          return std::make_pair(0U, &X86::GR32RegClass);
16603        if (VT == MVT::i16)
16604          return std::make_pair(0U, &X86::GR16RegClass);
16605        if (VT == MVT::i8 || VT == MVT::i1)
16606          return std::make_pair(0U, &X86::GR8RegClass);
16607        if (VT == MVT::i64 || VT == MVT::f64)
16608          return std::make_pair(0U, &X86::GR64RegClass);
16609        break;
16610      }
16611      // 32-bit fallthrough
16612    case 'Q':   // Q_REGS
16613      if (VT == MVT::i32 || VT == MVT::f32)
16614        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
16615      if (VT == MVT::i16)
16616        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
16617      if (VT == MVT::i8 || VT == MVT::i1)
16618        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
16619      if (VT == MVT::i64)
16620        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
16621      break;
16622    case 'r':   // GENERAL_REGS
16623    case 'l':   // INDEX_REGS
16624      if (VT == MVT::i8 || VT == MVT::i1)
16625        return std::make_pair(0U, &X86::GR8RegClass);
16626      if (VT == MVT::i16)
16627        return std::make_pair(0U, &X86::GR16RegClass);
16628      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
16629        return std::make_pair(0U, &X86::GR32RegClass);
16630      return std::make_pair(0U, &X86::GR64RegClass);
16631    case 'R':   // LEGACY_REGS
16632      if (VT == MVT::i8 || VT == MVT::i1)
16633        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
16634      if (VT == MVT::i16)
16635        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
16636      if (VT == MVT::i32 || !Subtarget->is64Bit())
16637        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
16638      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
16639    case 'f':  // FP Stack registers.
16640      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
16641      // value to the correct fpstack register class.
16642      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
16643        return std::make_pair(0U, &X86::RFP32RegClass);
16644      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
16645        return std::make_pair(0U, &X86::RFP64RegClass);
16646      return std::make_pair(0U, &X86::RFP80RegClass);
16647    case 'y':   // MMX_REGS if MMX allowed.
16648      if (!Subtarget->hasMMX()) break;
16649      return std::make_pair(0U, &X86::VR64RegClass);
16650    case 'Y':   // SSE_REGS if SSE2 allowed
16651      if (!Subtarget->hasSSE2()) break;
16652      // FALL THROUGH.
16653    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
16654      if (!Subtarget->hasSSE1()) break;
16655
16656      switch (VT.getSimpleVT().SimpleTy) {
16657      default: break;
16658      // Scalar SSE types.
16659      case MVT::f32:
16660      case MVT::i32:
16661        return std::make_pair(0U, &X86::FR32RegClass);
16662      case MVT::f64:
16663      case MVT::i64:
16664        return std::make_pair(0U, &X86::FR64RegClass);
16665      // Vector types.
16666      case MVT::v16i8:
16667      case MVT::v8i16:
16668      case MVT::v4i32:
16669      case MVT::v2i64:
16670      case MVT::v4f32:
16671      case MVT::v2f64:
16672        return std::make_pair(0U, &X86::VR128RegClass);
16673      // AVX types.
16674      case MVT::v32i8:
16675      case MVT::v16i16:
16676      case MVT::v8i32:
16677      case MVT::v4i64:
16678      case MVT::v8f32:
16679      case MVT::v4f64:
16680        return std::make_pair(0U, &X86::VR256RegClass);
16681      }
16682      break;
16683    }
16684  }
16685
16686  // Use the default implementation in TargetLowering to convert the register
16687  // constraint into a member of a register class.
16688  std::pair<unsigned, const TargetRegisterClass*> Res;
16689  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
16690
16691  // Not found as a standard register?
16692  if (Res.second == 0) {
16693    // Map st(0) -> st(7) -> ST0
16694    if (Constraint.size() == 7 && Constraint[0] == '{' &&
16695        tolower(Constraint[1]) == 's' &&
16696        tolower(Constraint[2]) == 't' &&
16697        Constraint[3] == '(' &&
16698        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
16699        Constraint[5] == ')' &&
16700        Constraint[6] == '}') {
16701
16702      Res.first = X86::ST0+Constraint[4]-'0';
16703      Res.second = &X86::RFP80RegClass;
16704      return Res;
16705    }
16706
16707    // GCC allows "st(0)" to be called just plain "st".
16708    if (StringRef("{st}").equals_lower(Constraint)) {
16709      Res.first = X86::ST0;
16710      Res.second = &X86::RFP80RegClass;
16711      return Res;
16712    }
16713
16714    // flags -> EFLAGS
16715    if (StringRef("{flags}").equals_lower(Constraint)) {
16716      Res.first = X86::EFLAGS;
16717      Res.second = &X86::CCRRegClass;
16718      return Res;
16719    }
16720
16721    // 'A' means EAX + EDX.
16722    if (Constraint == "A") {
16723      Res.first = X86::EAX;
16724      Res.second = &X86::GR32_ADRegClass;
16725      return Res;
16726    }
16727    return Res;
16728  }
16729
16730  // Otherwise, check to see if this is a register class of the wrong value
16731  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
16732  // turn into {ax},{dx}.
16733  if (Res.second->hasType(VT))
16734    return Res;   // Correct type already, nothing to do.
16735
16736  // All of the single-register GCC register classes map their values onto
16737  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
16738  // really want an 8-bit or 32-bit register, map to the appropriate register
16739  // class and return the appropriate register.
16740  if (Res.second == &X86::GR16RegClass) {
16741    if (VT == MVT::i8) {
16742      unsigned DestReg = 0;
16743      switch (Res.first) {
16744      default: break;
16745      case X86::AX: DestReg = X86::AL; break;
16746      case X86::DX: DestReg = X86::DL; break;
16747      case X86::CX: DestReg = X86::CL; break;
16748      case X86::BX: DestReg = X86::BL; break;
16749      }
16750      if (DestReg) {
16751        Res.first = DestReg;
16752        Res.second = &X86::GR8RegClass;
16753      }
16754    } else if (VT == MVT::i32) {
16755      unsigned DestReg = 0;
16756      switch (Res.first) {
16757      default: break;
16758      case X86::AX: DestReg = X86::EAX; break;
16759      case X86::DX: DestReg = X86::EDX; break;
16760      case X86::CX: DestReg = X86::ECX; break;
16761      case X86::BX: DestReg = X86::EBX; break;
16762      case X86::SI: DestReg = X86::ESI; break;
16763      case X86::DI: DestReg = X86::EDI; break;
16764      case X86::BP: DestReg = X86::EBP; break;
16765      case X86::SP: DestReg = X86::ESP; break;
16766      }
16767      if (DestReg) {
16768        Res.first = DestReg;
16769        Res.second = &X86::GR32RegClass;
16770      }
16771    } else if (VT == MVT::i64) {
16772      unsigned DestReg = 0;
16773      switch (Res.first) {
16774      default: break;
16775      case X86::AX: DestReg = X86::RAX; break;
16776      case X86::DX: DestReg = X86::RDX; break;
16777      case X86::CX: DestReg = X86::RCX; break;
16778      case X86::BX: DestReg = X86::RBX; break;
16779      case X86::SI: DestReg = X86::RSI; break;
16780      case X86::DI: DestReg = X86::RDI; break;
16781      case X86::BP: DestReg = X86::RBP; break;
16782      case X86::SP: DestReg = X86::RSP; break;
16783      }
16784      if (DestReg) {
16785        Res.first = DestReg;
16786        Res.second = &X86::GR64RegClass;
16787      }
16788    }
16789  } else if (Res.second == &X86::FR32RegClass ||
16790             Res.second == &X86::FR64RegClass ||
16791             Res.second == &X86::VR128RegClass) {
16792    // Handle references to XMM physical registers that got mapped into the
16793    // wrong class.  This can happen with constraints like {xmm0} where the
16794    // target independent register mapper will just pick the first match it can
16795    // find, ignoring the required type.
16796
16797    if (VT == MVT::f32 || VT == MVT::i32)
16798      Res.second = &X86::FR32RegClass;
16799    else if (VT == MVT::f64 || VT == MVT::i64)
16800      Res.second = &X86::FR64RegClass;
16801    else if (X86::VR128RegClass.hasType(VT))
16802      Res.second = &X86::VR128RegClass;
16803    else if (X86::VR256RegClass.hasType(VT))
16804      Res.second = &X86::VR256RegClass;
16805  }
16806
16807  return Res;
16808}
16809